diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard index 38094021c889611792b0041abf55b81f1dc04966..a5efadeb97ccc41449dc32a2c1dfcdfcf9fceac5 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard @@ -19,10 +19,10 @@ - + - - + @@ -203,7 +203,6 @@ - diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift index 640f51dafac5b684eda9cf002395ef3fe15e5d5f..0be1a9ee15d656c3338b18cde32ecadb6277e990 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift @@ -33,7 +33,7 @@ class MobileNet_ssd_hand: Net{ return " \(res)" } - func fetchResult(paddleMobileRes: ResultHolder) -> [Float32] { + func fetchResult(paddleMobileRes: ResultHolder) -> [Float32] { guard let interRes = paddleMobileRes.intermediateResults else { fatalError(" need have inter result ") @@ -47,13 +47,17 @@ class MobileNet_ssd_hand: Net{ fatalError() } - var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.originDim[0], h: score.originDim[1], w: score.originDim[2], c: score.originDim[3])) + var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3])) + print("score: ") + print(scoreFormatArr.strideArray()) var bboxArr = bbox.metalTexture.float32Array() + print("bbox: ") + print(bboxArr.strideArray()) let nmsCompute = NMSCompute.init() nmsCompute.scoreThredshold = 0.01 - nmsCompute.nmsTopK = 200 + nmsCompute.nmsTopK = 400 nmsCompute.keepTopK = 200 nmsCompute.nmsEta = 1.0 nmsCompute.nmsThreshold = 0.45 @@ -68,6 +72,7 @@ class MobileNet_ssd_hand: Net{ let output: [Float32] = result.map { $0.floatValue } + return output } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Net.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Net.swift index d7738a30a285a586ca98670f3a759cf37fcb5bc5..c643b4b63d263b2020170483c955bea4d6b6e404 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Net.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Net.swift @@ -37,7 +37,7 @@ protocol Net { var preprocessKernel: CusomKernel { get } func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) func resultStr(res: [Float]) -> String - func fetchResult(paddleMobileRes: ResultHolder) -> [Float32] + func fetchResult(paddleMobileRes: ResultHolder) -> [Float32] mutating func load() throws func predict(inTexture: MTLTexture, completion: @escaping ((time:TimeInterval, resultArray: [Float32])) -> Void) throws @@ -82,7 +82,7 @@ extension Net { } } - func fetchResult(paddleMobileRes: ResultHolder) -> [Float32] { + func fetchResult(paddleMobileRes: ResultHolder) -> [Float32] { return paddleMobileRes.resultArr } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift index 5032ef7d68f406b9adfa2baa04f6e5e754c55ff4..ecffd6af6d86b37492c0718acab77a712b9a8f33 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift @@ -19,17 +19,17 @@ import MetalPerformanceShaders let threadSupport = [1] -let modelHelperMap: [SupportModel : Net] = [.mobilenet_ssd : MobileNet_ssd_hand.init(), .genet : Genet.init()] +let modelHelperMap: [SupportModel : Net] = [ .mobilenet : MobileNet.init(), .mobilenet_ssd : MobileNet_ssd_hand.init(), .genet : Genet.init()] //, .genet : Genet.init() //let modelHelperMap: [SupportModel : Net] = [.mobilenet : MobileNet.init(), .mobilenet_ssd : MobileNet_ssd_hand.init()] enum SupportModel: String{ - // case mobilenet = "mobilenet" + case mobilenet = "mobilenet" case mobilenet_ssd = "mobilenetssd" case genet = "genet" static func supportedModels() -> [SupportModel] { - //.mobilenet, - return [.mobilenet_ssd ,.genet] + // + return [.mobilenet, .mobilenet_ssd ,.genet] } } @@ -87,7 +87,7 @@ class ViewController: UIViewController { fatalError() } -// print(result.resultArray) + print(result.resultArray.strideArray()) if i == max - 1 { let time = Date.init().timeIntervalSince(startDate) DispatchQueue.main.async { diff --git a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj index 55dfe91ce0a700ff7c47cdff55b461908f1657ad..b2c24db200e1440ba4830d78a37100b72983d3e4 100644 --- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj +++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj @@ -41,7 +41,6 @@ FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */; }; FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */; }; FC1B16B320EC9A4F00678B91 /* Kernels.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC1B16B220EC9A4F00678B91 /* Kernels.metal */; }; - FC1B186620ECF1C600678B91 /* ResizeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC1B186520ECF1C600678B91 /* ResizeKernel.swift */; }; FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */; }; FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74820F0B954007C0C6D /* ConvKernel.metal */; }; FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */; }; @@ -133,7 +132,6 @@ FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BatchNormKernel.swift; sourceTree = ""; }; FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddKernel.swift; sourceTree = ""; }; FC1B16B220EC9A4F00678B91 /* Kernels.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Kernels.metal; sourceTree = ""; }; - FC1B186520ECF1C600678B91 /* ResizeKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ResizeKernel.swift; sourceTree = ""; }; FC27990D21341016000B6BAD /* BoxCoder.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BoxCoder.metal; sourceTree = ""; }; FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PaddleMobileUnitTest.swift; sourceTree = ""; }; FC4CB74820F0B954007C0C6D /* ConvKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvKernel.metal; sourceTree = ""; }; @@ -326,7 +324,6 @@ FCEB6837212F00B100D2448E /* metal */, FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */, FC0E2DBB20EE45FE009C1FAC /* ConvKernel.swift */, - FC1B186520ECF1C600678B91 /* ResizeKernel.swift */, FC0E2DB920EE3B8D009C1FAC /* ReluKernel.swift */, FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */, FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */, @@ -506,7 +503,6 @@ FC039BBB20E11CC20081E9F8 /* ProgramDesc.swift in Sources */, FC9D037920E229E4000F735A /* OpParam.swift in Sources */, FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */, - FC1B186620ECF1C600678B91 /* ResizeKernel.swift in Sources */, FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */, FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */, FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */, diff --git a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift index 2b7636ce00ba2ec56aabd7e732039973ee8efdb3..afbdccce5d54eff69d07ce7546679cf3781a27d2 100644 --- a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift +++ b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift @@ -113,7 +113,7 @@ extension MTLDevice { return tensor } - func tensor2texture

(value: [P], dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> MTLTexture { + func tensor2texture

(value: [P], dim: [Int], transpose: [Int] = [0, 1, 2, 3], inComputePrecision: ComputePrecision = .Float32) -> MTLTexture { if value.count > 0 { assert(value.count == dim.reduce(1) { $0 * $1 }) } @@ -129,7 +129,13 @@ extension MTLDevice { textureDesc.height = ndim[1] textureDesc.depth = 1 textureDesc.usage = [.shaderRead, .shaderWrite] - textureDesc.pixelFormat = .rgba32Float + + if inComputePrecision == .Float16 { + textureDesc.pixelFormat = .rgba16Float + } else if inComputePrecision == .Float32 { + textureDesc.pixelFormat = .rgba32Float + } + textureDesc.textureType = .type2DArray textureDesc.storageMode = .shared textureDesc.cpuCacheMode = .defaultCache @@ -354,13 +360,8 @@ public extension MTLTexture { } // n c h w - dim - func toTensor(dim: (n: Int, c: Int, h: Int, w: Int), texturePrecision: ComputePrecision = .Float16) -> [Float32] { -// print("origin dim: \(dim)") - print("texture: ") - print(self) + func toTensor(dim: (n: Int, c: Int, h: Int, w: Int)) -> [Float32] { var textureArray: [Float32] -// if texturePrecision == .Float16 - if pixelFormat == .rgba32Float { textureArray = floatArray { (i : Float32) -> Float32 in return i @@ -388,11 +389,10 @@ public extension MTLTexture { } } } - print(" tensor count -- \(output.count)") return output } - func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int), texturePrecision: ComputePrecision = .Float16) -> [Float32] { + func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int)) -> [Float32] { // print("origin dim: \(dim)") // print("texture: ") // print(self) diff --git a/metal/paddle-mobile/paddle-mobile/Executor.swift b/metal/paddle-mobile/paddle-mobile/Executor.swift index 757c13404a80b695946d0daa6121b67d0be46d0e..0085cac22fb61ecb27f65beaef897d496dc9dd0c 100644 --- a/metal/paddle-mobile/paddle-mobile/Executor.swift +++ b/metal/paddle-mobile/paddle-mobile/Executor.swift @@ -14,18 +14,18 @@ import Foundation -let testTo = 61 +let testTo = 161 var isTest = false -let computePrecision: ComputePrecision = .Float32 +let computePrecision: ComputePrecision = .Float16 -public class ResultHolder { +public class ResultHolder { public let dim: [Int] - public let resultArr: [P] + public let resultArr: [Float32] public var intermediateResults: [String : [Variant]]? public let elapsedTime: Double - public init(inDim: [Int], inResult: [P], inElapsedTime: Double, inIntermediateResults: [String : [Variant]]? = nil) { + public init(inDim: [Int], inResult: [Float32], inElapsedTime: Double, inIntermediateResults: [String : [Variant]]? = nil) { dim = inDim resultArr = inResult elapsedTime = inElapsedTime @@ -78,7 +78,7 @@ public class Executor { } } - public func predict(input: MTLTexture, dim: [Int], completionHandle: @escaping (ResultHolder

) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws { + public func predict(input: MTLTexture, dim: [Int], completionHandle: @escaping (ResultHolder) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws { guard let buffer = queue.makeCommandBuffer() else { throw PaddleMobileError.predictError(message: "CommandBuffer is nil") } @@ -114,12 +114,10 @@ public class Executor { buffer.addCompletedHandler { (commandbuffer) in -// let inputArr = resInput.floatArray(res: { (p:P) -> P in -// return p -// }) -// print(inputArr.strideArray()) +// let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2])) +//// print(inputArr.strideArray()) // -// writeToLibrary(fileName: "genet_input_hand", array: inputArr) +// writeToLibrary(fileName: "test_image_ssd", array: inputArr) // print("write to library done") // return // print(inputArr) @@ -133,23 +131,23 @@ public class Executor { print(" 第 \(i) 个 op: ") op.delogOutput() } -// self.ops[59].delogOutput() + +// return; +// self.ops[testTo - 2].delogOutput() +// self.ops[testTo - 1].delogOutput() // self.ops[60].delogOutput() - return +// return let afterDate = Date.init() - - var resultHolder: ResultHolder

+ var resultHolder: ResultHolder if except > 0 { - resultHolder = ResultHolder

.init(inDim: [], inResult: [], inElapsedTime: afterDate.timeIntervalSince(beforeDate), inIntermediateResults: outputTextures) + resultHolder = ResultHolder.init(inDim: [], inResult: [], inElapsedTime: afterDate.timeIntervalSince(beforeDate), inIntermediateResults: outputTextures) } else { let outputVar: Variant = self.program.scope.output()! let output: Texture

= outputVar as! Texture

- resultHolder = ResultHolder

.init(inDim: output.dim.dims, inResult: output.metalTexture.floatArray(res: { (p:P) -> P in - return p - }), inElapsedTime: afterDate.timeIntervalSince(beforeDate)) + resultHolder = ResultHolder.init(inDim: output.dim.dims, inResult: output.toTensor(), inElapsedTime: afterDate.timeIntervalSince(beforeDate)) } completionHandle(resultHolder) diff --git a/metal/paddle-mobile/paddle-mobile/Loader.swift b/metal/paddle-mobile/paddle-mobile/Loader.swift index 4b5f91f9c771cd5d9031a548cdc1803fc417bbcc..68ce8c0691cd29a227758dc750315b45c305ed92 100644 --- a/metal/paddle-mobile/paddle-mobile/Loader.swift +++ b/metal/paddle-mobile/paddle-mobile/Loader.swift @@ -159,7 +159,7 @@ public class Loader { } catch let error { throw error } - tensor.convert(to: DataLayout.NHWC()) +// tensor.convert(to: DataLayout.NHWC()) // tensor.initBuffer(device: device) scope[varDesc.name] = tensor } else { @@ -168,7 +168,7 @@ public class Loader { } } else { if varDesc.name == fetchKey { - scope[varDesc.name] = ResultHolder

.init(inDim: [], inResult: [], inElapsedTime: 0.0) + scope[varDesc.name] = ResultHolder.init(inDim: [], inResult: [], inElapsedTime: 0.0) } else if varDesc.name == feedKey { } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift index 193a271ccfc3ea3a68429f227394552c9f609f6f..d7c1aba2494479f30806f80f9ccefe943b5f7101 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift @@ -59,28 +59,28 @@ class BoxcoderOp: Operator, BoxcoderParam

func delogOutput() { print(" \(type) output: ") -// let priorBoxOriginDim = para.priorBox.originDim -// let priorBoxArray: [Float32] = para.priorBox.metalTexture.realNHWC(dim: (n: priorBoxOriginDim[0], h: priorBoxOriginDim[1], w: priorBoxOriginDim[2], c: priorBoxOriginDim[3])) +// let priorBoxpadToFourDim = para.priorBox.padToFourDim +// let priorBoxArray: [Float32] = para.priorBox.metalTexture.realNHWC(dim: (n: priorBoxpadToFourDim[0], h: priorBoxpadToFourDim[1], w: priorBoxpadToFourDim[2], c: priorBoxpadToFourDim[3])) // print(" prior box ") // print(priorBoxArray.strideArray()) // -// let priorBoxVarOriginDim = para.priorBoxVar.originDim -// let priorBoxVarArray: [Float32] = para.priorBoxVar.metalTexture.realNHWC(dim: (n: priorBoxVarOriginDim[0], h: priorBoxVarOriginDim[1], w: priorBoxVarOriginDim[2], c: priorBoxVarOriginDim[3])) +// let priorBoxVarpadToFourDim = para.priorBoxVar.padToFourDim +// let priorBoxVarArray: [Float32] = para.priorBoxVar.metalTexture.realNHWC(dim: (n: priorBoxVarpadToFourDim[0], h: priorBoxVarpadToFourDim[1], w: priorBoxVarpadToFourDim[2], c: priorBoxVarpadToFourDim[3])) // print(" prior box var ") // print(priorBoxVarArray.strideArray()) // -// let targetBoxOriginDim = para.targetBox.originDim -// let targetBoxArray: [Float32] = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxOriginDim[0], h: targetBoxOriginDim[1], w: targetBoxOriginDim[2], c: targetBoxOriginDim[3])) +// let targetBoxpadToFourDim = para.targetBox.padToFourDim +// let targetBoxArray: [Float32] = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxpadToFourDim[0], h: targetBoxpadToFourDim[1], w: targetBoxpadToFourDim[2], c: targetBoxpadToFourDim[3])) // print(" target box ") // print(targetBoxArray.strideArray()) - let targetBoxOriginDim = para.targetBox.originDim - let targetBoxArray = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxOriginDim[0], h: targetBoxOriginDim[1], w: targetBoxOriginDim[2], c: targetBoxOriginDim[3]), texturePrecision: computePrecision) + let targetBoxpadToFourDim = para.targetBox.padToFourDim + let targetBoxArray = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxpadToFourDim[0], h: targetBoxpadToFourDim[1], w: targetBoxpadToFourDim[2], c: targetBoxpadToFourDim[3])) print(" target box ") print(targetBoxArray.strideArray()) - let originDim = para.output.originDim - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) + let padToFourDim = para.output.padToFourDim + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) print(" output ") print(outputArray.strideArray()) } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift index 67e564dddd0aadcb059767380e7946040e8cd284..1abdb66aa7bdf89759a5987e3dde523c1f1dcf41 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift @@ -65,12 +65,12 @@ class ConcatOp: Operator, ConcatParam

>, Run func delogOutput() { print(" \(type) output: ") - let originDim = para.output.originDim + let padToFourDim = para.output.padToFourDim if para.output.transpose == [0, 1, 2, 3] { - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) print(outputArray.strideArray()) } else if para.output.transpose == [0, 2, 3, 1] { - print(para.output.metalTexture.toTensor(dim: (n: originDim[0], c: originDim[1], h: originDim[2], w: originDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3])).strideArray()) } else { fatalError(" not implemet") } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift index 7bced214bd11bfef61eb405d59073f004e765e03..43935b65d1442d7c2e1ca3db49168140569c433f 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift @@ -34,7 +34,7 @@ class ConvAddBatchNormReluParam: OpParam { scale = try ConvAddBatchNormReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope) mean = try ConvAddBatchNormReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope) - y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.inputs, from: inScope) + y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.paraInputs, from: inScope) } catch let error { throw error } @@ -112,7 +112,7 @@ class ConvAddBatchNormReluOp: Operator: Operator, ConvAddParam

>, } func delogOutput() { +// print("op \(type): ") +// print(" padding: ") +// print(para.paddings) +// print("stride: ") +// print(para.stride) +// print("dilations: ") +// print(para.dilations) +// print(" para input dim: ") +// print(para.input.dim) +// print(" para filter dim: ") +// print(para.filter.dim) +// print(" para output dim: ") +// print(para.output.dim) +// print(" biase: ") +// let biase: [Float32] = para.y.buffer.array() +// print(biase) - print(" padding: ") - print(para.paddings) - print("stride: ") - print(para.stride) - print("dilations: ") - print(para.dilations) print(" \(type) output: ") - print(" para input dim: ") - print(para.input.dim) - print(" para filter dim: ") - print(para.filter.dim) - print(" para output dim: ") - print(para.output.dim) - print(" biase: ") - let biase: [Float32] = para.y.buffer.array() - print(biase) - - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift index be8c57d3ace01dabd652e0e80a43c5a053213e28..959fe44b98dabec2b39fdfdb438d482d720caa61 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift @@ -110,7 +110,7 @@ class ConvBNReluOp: Operator, ConvBNReluPa func delogOutput() { print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvOp.swift index 98f4a077671bbc3161f147bf73c9691dd1f86536..e82eb1f4753f0ebfdb5a949c85181a0ae52ea2da 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConvOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvOp.swift @@ -75,7 +75,7 @@ class ConvOp: Operator, ConvParam

>, Runable, func delogOutput() { print("conv output : ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray()) + print(para.output.toTensor().strideArray()) // let _: Float16? = para.output.metalTexture.logDesc() } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift index 387fa420b68f8004a12af85ca398cf306f41a5c6..e1b62c24f4bec7104ba7489b56460884acc0cc21 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift @@ -43,13 +43,15 @@ class ConvTransposeOp: Operator, ConvTr } func delogOutput() { + print(" \(type) output: ") - let originDim = para.output.originDim + let padToFourDim = para.output.padToFourDim if para.output.transpose == [0, 1, 2, 3] { - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) print(outputArray.strideArray()) } else if para.output.transpose == [0, 2, 3, 1] { - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) + let output = para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])) + print(output.strideArray()) } else { print(" not implement") } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift index 36f477bc1cb48007b5b28bf27a7424940918025b..ec76eecf1fc9736d9dff6a4cf0d69a314a9b1e0d 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift @@ -58,6 +58,6 @@ class DepthConvOp: Operator, ConvParam

>, Runa func delogOutput() { print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift index 0ea8a62c5c0bf30da200add2a96410136d2f40fb..8575cfd88c7ddea2f007cad21507b4620c87d3e2 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift @@ -65,6 +65,6 @@ class DwConvBNReluOp: Operator, ConvBNRelu func delogOutput() { print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift index 0f96b204d59f3d4a0dd0fae20340811855421c95..7fac84812b115f28b391d5dfe29a4aed0fd46969 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift @@ -71,12 +71,15 @@ class ElementwiseAddOp: Operator, Elem // print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray()) print(" \(type) output: ") - let originDim = para.output.originDim + + print(para.inputY) + + let padToFourDim = para.output.padToFourDim if para.output.transpose == [0, 1, 2, 3] { - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) print(outputArray.strideArray()) } else if para.output.transpose == [0, 2, 3, 1] { - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) } else { print(" not implement") } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift index b6075a807d1637c554587ea99724d6ff1f38e7e6..382ea58b844b25bb855ed7cdc155a860bca45da5 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift @@ -61,7 +61,7 @@ class FeedOp: Operator, FeedParam< func delogOutput() { print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift index bae452dec331957ceda5a6f503802352f63a6dbe..bf0a2959fa9627f5cbe994dcd183f9ed7f04456c 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift @@ -15,53 +15,60 @@ import Foundation class BatchNormKernel: Kernel, Computable { - var newScale: MTLBuffer - var newBias: MTLBuffer + var newScale: MTLBuffer + var newBias: MTLBuffer + + required init(device: MTLDevice, param: BatchNormParam

) { + guard let newScale = device.makeBuffer(length: param.inputScale.buffer.length) else { + fatalError() + } + + guard let newBias = device.makeBuffer(length: param.inputBias.buffer.length) else { + fatalError() + } + self.newScale = newScale + self.newBias = newBias + + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "batchnorm") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "batchnorm_half") + } else { + fatalError() + } + + let varianceBuffer : MTLBuffer = param.inputVariance.buffer - required init(device: MTLDevice, param: BatchNormParam

) { - guard let newScale = device.makeBuffer(length: param.inputScale.buffer.length) else { - fatalError() - } - guard let newBias = device.makeBuffer(length: param.inputBias.buffer.length) else { - fatalError() - } - self.newScale = newScale - self.newBias = newBias - - super.init(device: device, inFunctionName: "batchnorm") - - let varianceBuffer : MTLBuffer = param.inputVariance.buffer - - var invStd: [Float32] = Array(repeating: 0, count: varianceBuffer.length) - let varianceContents = varianceBuffer.contents().assumingMemoryBound(to: P.self) - for i in 0..<(varianceBuffer.length / MemoryLayout

.stride) { - invStd[i] = 1 / (Float32(varianceContents[i]) + param.epsilon).squareRoot() - } - - let newScaleContents = newScale.contents().assumingMemoryBound(to: P.self) - let newBiasContents = newBias.contents().assumingMemoryBound(to: P.self) - let scale : MTLBuffer = param.inputScale.buffer - let scaleContents = scale.contents().assumingMemoryBound(to: P.self) - let bias : MTLBuffer = param.inputBias.buffer - let biasContents = bias.contents().assumingMemoryBound(to: P.self) - let meanContents = param.inputMean.buffer.contents().assumingMemoryBound(to: P.self) - - for i in 0..<(newScale.length / MemoryLayout

.stride) { - newScaleContents[i] = P(invStd[i] * Float32(scaleContents[i])) - newBiasContents[i] = P(Float32(biasContents[i]) - Float32(meanContents[i]) * invStd[i] * Float32(scaleContents[i])) - } + var invStd: [Float32] = Array(repeating: 0, count: varianceBuffer.length) + let varianceContents = varianceBuffer.contents().assumingMemoryBound(to: P.self) + for i in 0..<(varianceBuffer.length / MemoryLayout

.stride) { + invStd[i] = 1 / (Float32(varianceContents[i]) + param.epsilon).squareRoot() } - func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") - } - print("BatchNorm compute") - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBuffer(newScale, offset: 0, index: 0) - encoder.setBuffer(newBias, offset: 0, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() + let newScaleContents = newScale.contents().assumingMemoryBound(to: P.self) + let newBiasContents = newBias.contents().assumingMemoryBound(to: P.self) + let scale : MTLBuffer = param.inputScale.buffer + let scaleContents = scale.contents().assumingMemoryBound(to: P.self) + let bias : MTLBuffer = param.inputBias.buffer + let biasContents = bias.contents().assumingMemoryBound(to: P.self) + let meanContents = param.inputMean.buffer.contents().assumingMemoryBound(to: P.self) + + for i in 0..<(newScale.length / MemoryLayout

.stride) { + newScaleContents[i] = P(invStd[i] * Float32(scaleContents[i])) + newBiasContents[i] = P(Float32(biasContents[i]) - Float32(meanContents[i]) * invStd[i] * Float32(scaleContents[i])) + } + } + + func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") } + print("BatchNorm compute") + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBuffer(newScale, offset: 0, index: 0) + encoder.setBuffer(newBias, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift index bcdcc732b08e5f578bf1b35eb754cf221217144c..66324dd47086fd7c1ccffb674c0f8b8623416e0d 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift @@ -49,26 +49,37 @@ class ConvAddBatchNormReluKernel: Kernel, Computable, Testable var metalParam: MetalConvParam! required init(device: MTLDevice, param: ConvAddBatchNormReluParam

) { - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) - - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1") - } else if param.filter.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3") - } else { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3") - } - param.filter.initBuffer(device: device, precision: computePrecision) - param.y.initBuffer(device: device, precision: computePrecision) - param.variance.initBuffer(device: device, precision: .Float32) param.mean.initBuffer(device: device, precision: .Float32) param.scale.initBuffer(device: device, precision: .Float32) param.bias.initBuffer(device: device, precision: .Float32) + if computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3") + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3") + } else { + fatalError(" unsupport ") + } + } else if computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1_half") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3_half") + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3_half") + } else { + fatalError(" unsupport ") + } + } else { + fatalError() + } let offsetX = param.filter.width/2 - Int(param.paddings[0]) let offsetY = param.filter.height/2 - Int(param.paddings[1]) @@ -108,10 +119,10 @@ class ConvAddBatchNormReluKernel: Kernel, Computable, Testable var newBiaseBuffer: MTLBuffer var newScaleBuffer: MTLBuffer - if computePrecision == .Float16 { + if computePrecision == .Float32 { newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)! newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)! - } else if computePrecision == .Float32 { + } else if computePrecision == .Float16 { newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! @@ -138,7 +149,6 @@ class ConvAddBatchNormReluKernel: Kernel, Computable, Testable throw PaddleMobileError.predictError(message: " encode is nil") } - encoder.setTexture(param.input.metalTexture, index: 0) encoder.setTexture(param.output.metalTexture, index: 1) encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift index f2ad60fcfc8ec77e40a21a130026fcb8fa290621..d2b54d9be60694871b4d4e3ba4bd3852bc94e875 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift @@ -17,14 +17,23 @@ import Foundation class ConvAddKernel: Kernel, Computable { var metalParam: MetalConvParam! required init(device: MTLDevice, param: ConvAddParam

) { - + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) + param.filter.initBuffer(device: device, precision: computePrecision) + param.y.initBuffer(device: device, precision: computePrecision) + if computePrecision == .Float16 { if param.filter.width == 1 && param.filter.height == 1 { super.init(device: device, inFunctionName: "conv_add_1x1_half") } else if param.filter.channel == 1 { super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half") - } else { + } else if param.filter.width == 3 && param.filter.height == 3 { super.init(device: device, inFunctionName: "conv_add_3x3_half") + } else if param.filter.width == 1 && param.filter.height == 5 { + super.init(device: device, inFunctionName: "conv_add_5x1_half") + } else if param.filter.width == 5 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x5_half") + } else { + fatalError(" unsupport yet ") } } else if computePrecision == .Float32 { if param.filter.width == 1 && param.filter.height == 1 { @@ -35,22 +44,21 @@ class ConvAddKernel: Kernel, Computable { super.init(device: device, inFunctionName: "conv_add_5x1") } else if param.filter.width == 5 && param.filter.height == 1 { super.init(device: device, inFunctionName: "conv_add_1x5") - } else { + } else if param.filter.width == 3 && param.filter.height == 3 { super.init(device: device, inFunctionName: "conv_add_3x3") + } else { + fatalError(" unsupport yet ") } } else { fatalError() } + + let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) - - param.filter.initBuffer(device: device, precision: computePrecision) - param.y.initBuffer(device: device, precision: computePrecision) - print(" function: \(functionName)") print("offset x: \(offsetX)") print("offset y: \(offsetY)") diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift index 4b978e24d2b968d4ee2ee8443d74ce54f502be8b..bbf4eeaaf41e321f2c7c469011f1a243ef035107 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift @@ -49,35 +49,41 @@ class ConvBNReluKernel: Kernel, Computable, Testable { } var metalParam: MetalConvParam! - + required init(device: MTLDevice, param: ConvBNReluParam

) { + + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) + param.filter.initBuffer(device: device, precision: computePrecision) + param.variance.initBuffer(device: device, precision: .Float32) + param.mean.initBuffer(device: device, precision: .Float32) + param.scale.initBuffer(device: device, precision: .Float32) + param.bias.initBuffer(device: device, precision: .Float32) + if computePrecision == .Float32 { if param.filter.width == 1 && param.filter.height == 1 { super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1") } else if param.filter.channel == 1 { super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3") - } else { + } else if param.filter.width == 3 && param.filter.height == 3 { super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3") + } else { + fatalError(" unsupport ") } } else if computePrecision == .Float16 { if param.filter.width == 1 && param.filter.height == 1 { super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half") } else if param.filter.channel == 1 { super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half") - } else { + } else if param.filter.width == 3 && param.filter.height == 3 { super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half") + } else { + fatalError(" unsupport ") } } else { fatalError() } - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) - param.filter.initBuffer(device: device, precision: computePrecision) - - param.variance.initBuffer(device: device, precision: .Float32) - param.mean.initBuffer(device: device, precision: .Float32) - param.scale.initBuffer(device: device, precision: .Float32) - param.bias.initBuffer(device: device, precision: .Float32) + let offsetX = param.filter.width/2 - Int(param.paddings[0]) let offsetY = param.filter.height/2 - Int(param.paddings[1]) diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift index 6c10ba8d18f3c0e386769c11867e8bb361f49b21..345136a503d8eda6ad23f85ef01eb53fa539d453 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift @@ -27,18 +27,20 @@ public struct MetalConvParam { class ConvKernel: Kernel, Computable { var metalParam: MetalConvParam! required init(device: MTLDevice, param: ConvParam

) { + param.filter.initBuffer(device: device, precision: ComputePrecision.Float32) if param.filter.width == 1 && param.filter.height == 1 { super.init(device: device, inFunctionName: "conv_1x1") } else if param.filter.channel == 1 { super.init(device: device, inFunctionName: "depthwise_conv_3x3") - } else { + } else if param.filter.width == 3 && param.filter.height == 3 { super.init(device: device, inFunctionName: "conv_3x3") + } else { + fatalError(" unsupport ") } - + let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0]) let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1]) let offsetZ = 0.0 - param.filter.initBuffer(device: device, precision: ComputePrecision.Float32) metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift index 9354972d71c776392e9b7bf621e3e4de7233ffe8..435776c850854f2fc4259e8a2089299da825f463 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift @@ -31,7 +31,27 @@ struct MetalConvTransposeParam { class ConvTransposeKernel: Kernel, Computable{ var metalParam: MetalConvTransposeParam! required init(device: MTLDevice, param: ConvTransposeParam

) { - super.init(device: device, inFunctionName: "conv_transpose") + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) + param.filter.initBuffer(device: device, precision: computePrecision, convertToNHWC: false, withTranspose: true) + if computePrecision == .Float32 { + if param.stride == [2, 2] && param.stride == [2, 2] { + super.init(device: device, inFunctionName: "conv_transpose2x2_stride2") + } else { + fatalError(" -- conv transpose unsupported yet -- ") + } + } else if computePrecision == .Float16 { + if param.stride == [2, 2] && param.stride == [2, 2] { + super.init(device: device, inFunctionName: "conv_transpose2x2_stride2_half") + } else { + fatalError(" -- conv transpose unsupported yet -- ") + } + } else { + fatalError() + } + +// let filter: [Float32] = param.filter.buffer.array() +// print(" conv transpose filter") +// print(filter) let kernelWidth = UInt16(param.filter.width) let kernelHeight = UInt16(param.filter.height) @@ -43,9 +63,7 @@ class ConvTransposeKernel: Kernel, Computable{ let dilationY = UInt16(param.dilations[1]) metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY) - - param.output.initTexture(device: device, inTranspose: param.input.transpose) - param.filter.initBuffer(device: device) + } func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam

) throws { diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift index e8e9dffc4562f0fc411e81e25f245b09ce292b0b..a4c88016b3a442d8d6937214755db9e33e9cb28f 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift @@ -15,6 +15,7 @@ import Foundation struct ElementwiseAddMetalParam { + var unsafe_one_dim: Int32 = 0 var fast: Int32 = 0 var axis: Int32 = 0 var yoff: Int32 = 0 @@ -26,8 +27,14 @@ struct ElementwiseAddMetalParam { class ElementwiseAddKernel: Kernel, Computable { required init(device: MTLDevice, param: ElementwiseAddParam

) { - super.init(device: device, inFunctionName: "elementwise_add") param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "elementwise_add") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "elementwise_add_half") + } else { + fatalError() + } } func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam

) throws { @@ -59,6 +66,11 @@ class ElementwiseAddKernel: Kernel, Computable { emp.fast = 1 } + // TODO: + if param.inputY.tensorDim.cout() == 1 { + emp.unsafe_one_dim = 1; + } + encoder.setBytes(&emp, length: MemoryLayout.size, index: 0) encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) encoder.endEncoding() diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift index b6db7231e83943dbce6f2cbe3266af9fbe508aef..59754e57c112ebf5e300bd0ef98465e08744e4bf 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift @@ -27,8 +27,14 @@ struct PoolMetalParam { class PoolKernel: Kernel, Computable{ required init(device: MTLDevice, param: PoolParam

) { - super.init(device: device, inFunctionName: "pool") param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "pool") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "pool_half") + } else { + fatalError() + } } func compute(commandBuffer: MTLCommandBuffer, param: PoolParam

) throws { diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift index 1545a848dacb4f11a2a68df31f7ea49a23799a87..32505faf0faeab97586352d60535d05e86e59631 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift @@ -10,15 +10,27 @@ import Foundation class PreluKernel: Kernel, Computable{ required init(device: MTLDevice, param: PreluParam

) { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "prelu_channel") - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "prelu_element") - } else { - super.init(device: device, inFunctionName: "prelu_other") - } param.alpha.initBuffer(device: device, precision: computePrecision) param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) + if computePrecision == .Float32 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "prelu_channel") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "prelu_element") + } else { + super.init(device: device, inFunctionName: "prelu_other") + } + } else if computePrecision == .Float16 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "prelu_element_half") + } else { + super.init(device: device, inFunctionName: "prelu_other_half") + } + } else { + fatalError() + } } func compute(commandBuffer: MTLCommandBuffer, param: PreluParam

) throws { diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift index ece3e3915dd130c48ef717f29cd95a13eca52d0c..426e91cfbe26385a7b30931d155513633d2bc988 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift @@ -33,6 +33,10 @@ class PriorBoxKernel: Kernel, Computable{ var metalParam: PriorBoxMetalParam! required init(device: MTLDevice, param: PriorBoxParam

) { + + param.output.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision) + param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision) + if computePrecision == .Float32 { super.init(device: device, inFunctionName: "prior_box") } else if computePrecision == .Float16 { @@ -41,9 +45,6 @@ class PriorBoxKernel: Kernel, Computable{ fatalError() } - param.output.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision) - param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision) - let n = 1 let h = param.output.dim[1] let w = param.output.dim[2] @@ -52,11 +53,11 @@ class PriorBoxKernel: Kernel, Computable{ param.output.dim = Dim.init(inDim: [n, h, w, c]) param.output.transpose = [0, 1, 2, 3] - let imageWidth = Float32(param.inputImage.originDim[3]) - let imageHeight = Float32(param.inputImage.originDim[2]) + let imageWidth = Float32(param.inputImage.padToFourDim[3]) + let imageHeight = Float32(param.inputImage.padToFourDim[2]) - let featureWidth = param.input.originDim[3] - let featureHeight = param.input.originDim[2] + let featureWidth = param.input.padToFourDim[3] + let featureHeight = param.input.padToFourDim[2] if param.stepW == 0 || param.stepH == 0 { param.stepW = Float32(imageWidth) / Float32(featureWidth) diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift index 3c669cf4d965f7842070c4d38427f6d1d7440db5..18f279e9f3c5226d6eea5b5e6f0a42502173071e 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift @@ -15,17 +15,23 @@ import Foundation class ReluKernel: Kernel, Computable{ - func compute(commandBuffer: MTLCommandBuffer, param: ReluParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() + func compute(commandBuffer: MTLCommandBuffer, param: ReluParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") } - - required init(device: MTLDevice, param: ReluParam

) { - super.init(device: device, inFunctionName: "relu") + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: ReluParam

) { + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "relu") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "relu_half") + } else { + fatalError() } + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeKernel.swift deleted file mode 100644 index d2795111ad1f43c759b95aa52ed34085a4ac147a..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeKernel.swift +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -// -//import Foundation -//import MetalPerformanceShaders -// -// -//struct ResizeParam: OpParam{ -// typealias OutputType = <#type#> -// -// typealias ParamPrecisionType = <#type#> -// -// let input: MTLTexture -// let output: MTLTexture -// let expectDim: Dim -//} -// -//struct OutputDim { -// let width: UInt16 -// let height: UInt16 -// let strideX: UInt16 -// let strideY: UInt16 -//} -// -//class ResizeKernel: Kernel, Computable{ -// var lanczos: MPSImageLanczosScale -// required init(device: MTLDevice, param: ResizeParam) { -// lanczos = MPSImageLanczosScale.init(device: device) -// super.init(device: device, inFunctionName: "resize") -// } -// func compute(commandBuffer: MTLCommandBuffer, param: ResizeParam) throws { -//// guard let encoder = commandBuffer.makeComputeCommandEncoder() else { -//// throw PaddleMobileError.predictError(message: " encode is nil") -//// } -// lanczos.encode(commandBuffer: commandBuffer, sourceTexture: param.input, destinationTexture: param.output) -// -//// encoder.setTexture(param.input, index: 0) -//// encoder.setTexture(param.output, index: 1) -//// let strideX = param.input.width/param.expectDim[2] -//// let strideY = param.input.height/param.expectDim[1] -//// var outputDim = OutputDim.init(width: UInt16(param.expectDim[1]), height: UInt16(param.expectDim[2]), strideX: UInt16(strideX), strideY: UInt16(strideY)) -//// encoder.setBytes(&outputDim, length: MemoryLayout.size, index: 0) -//// encoder.dispatch(computePipline: pipline, outTexture: param.output) -//// encoder.endEncoding() -// } -// -// -// -// -//} - diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift index 6f6d0af477f62d7f438b8b6a38c825c2eb95163f..0c166c3563149d60dc8cbee451ef23e6a1fb9b93 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift @@ -21,6 +21,17 @@ struct SoftmaxMetalParam { class SoftmaxKernel: Kernel, Computable{ + required init(device: MTLDevice, param: SoftmaxParam

) { + param.output.initTexture(device: device, computePrecision: computePrecision) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "softmax") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "softmax_half") + } else { + fatalError() + } + } + func compute(commandBuffer: MTLCommandBuffer, param: SoftmaxParam

) throws { guard let encoder = commandBuffer.makeComputeCommandEncoder() else { throw PaddleMobileError.predictError(message: " encoder is nil") @@ -32,19 +43,12 @@ class SoftmaxKernel: Kernel, Computable{ N: Int32(param.input.tensorDim[0]), K: Int32(param.input.tensorDim[1]) ) + + print(" soft max param: ") + print(smp) encoder.setBytes(&smp, length: MemoryLayout.size, index: 0) encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) encoder.endEncoding() } - required init(device: MTLDevice, param: SoftmaxParam

) { - param.output.initTexture(device: device, computePrecision: computePrecision) - if computePrecision == .Float32 { - super.init(device: device, inFunctionName: "softmax") - } else if computePrecision == .Float16 { - super.init(device: device, inFunctionName: "softmax_half") - } else { - fatalError() - } - } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal index 3cb71ca21b425a76a6395351531ea7e81edcbbf8..0159dcf3e008ed4fe5bfd083674084fb1fb9e978 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal @@ -429,7 +429,122 @@ kernel void depthwise_conv_add_3x3_half(texture2d_array in } +kernel void conv_add_5x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + ushort dilation_y = param.dilationY; + half4 input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i); + + for (int j = 0; j < 5; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } + } + output = output + float4(biase[gid.z]); + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void conv_add_1x5_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + ushort dilation_x = param.dilationX; + half4 input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i); + + for (int j = 0; j < 5; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + output = output + float4(biase[gid.z]); + outTexture.write(half4(output), gid.xy, gid.z); +} kernel void test_conv_add_3x3(texture2d_array inTexture [[texture(0)]], @@ -502,3 +617,6 @@ kernel void test_conv_add_3x3(texture2d_array inTexture [ // output = output + biase[gid.z]; outTexture.write(output, gid.xy, gid.z); } + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal index 35c2c56cace5975c5fe166b007bb695eba163325..c07515c13da54c7f8bf698f976e47f7cda6de32b 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal @@ -148,4 +148,133 @@ kernel void conv_1x1(texture2d_array inTexture [[texture( } +kernel void conv_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(float4(input[j]), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(float4(input[j]), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(float4(input[j]), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } + } + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void depthwise_conv_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]); + output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]); + output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]); + output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]); + } + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void conv_1x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(float4(input), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(float4(input), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(float4(input), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(float4(input), float4(weight_w)); + } + outTexture.write(half4(output), gid.xy, gid.z); +} + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal index 5c5a499fceae9061dba30307e2c4ecd3b0e89164..baf3f31157a472412bb08ccb3c803f5ec9e25d9c 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal @@ -29,11 +29,11 @@ struct MetalConvTransposeParam{ ushort dilationY; }; -kernel void conv_transpose(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvTransposeParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - uint3 gid [[thread_position_in_grid]]){ +kernel void conv_transpose2x2_stride2(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvTransposeParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() || gid.z >= outTexture.get_array_size()) { @@ -41,48 +41,134 @@ kernel void conv_transpose(texture2d_array inTexture [[te } int input_array_size = inTexture.get_array_size(); - - uint kernel_one_output_slice = input_array_size * param.kernelW * param.kernelH; - - uint kernel_stride_z = gid.z * 4 * (kernel_one_output_slice); + int kernel_index_x = gid.x % 2; + int kernel_index_y = gid.y % 2; + int kernel_index = kernel_index_y * 2 + kernel_index_x; + int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size); + int input_x = gid.x / 2; + int input_y = gid.y / 2; constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 output = float4(0.0); + for (int i = 0; i < input_array_size; ++i) { + + float4 input = inTexture.sample(sample, float2(input_x, input_y), i); + + float4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i]; + float4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i]; + float4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i]; + float4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i]; + + output.x += dot(input, kernel_slice0); + + output.y += dot(input, kernel_slice1); + + output.z += dot(input, kernel_slice2); + + output.w += dot(input, kernel_slice3); + } + + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_transpose2x2_stride2_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvTransposeParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } - float4 output; + int input_array_size = inTexture.get_array_size(); + int kernel_index_x = gid.x % 2; + int kernel_index_y = gid.y % 2; + int kernel_index = kernel_index_y * 2 + kernel_index_x; + int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size); + int input_x = gid.x / 2; + int input_y = gid.y / 2; - for (int w = 0; w < param.kernelW; ++w) { - int input_x = (gid.x - w * param.dilationX + param.paddingX) / param.strideX; - if (input_x < 0 || input_x >= int(inTexture.get_width())) { - continue; - } + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 output = float4(0.0); + for (int i = 0; i < input_array_size; ++i) { - for (int h = 0; h < param.kernelH; ++h) { - int input_y = (gid.y - h * param.dilationY + param.paddingY) / param.strideY; - if (input_y < 0 || input_y >= int(inTexture.get_height())) { - continue; - } - - uint kernel_index = (w * param.kernelH + h) * inTexture.get_array_size(); - - for (int slice = 0; slice < input_array_size; ++slice) { - - float4 input; - float4 kernel_slice = weights[kernel_stride_z + 0 * kernel_one_output_slice + kernel_index + slice]; - float4 kernel_slice1 = weights[kernel_stride_z + 1 * kernel_one_output_slice + kernel_index + slice]; - - float4 kernel_slice2 = weights[kernel_stride_z + 2 * kernel_one_output_slice + kernel_index + slice]; - - float4 kernel_slice3 = weights[kernel_stride_z + 3 * kernel_one_output_slice + kernel_index + slice]; - - input = inTexture.sample(sample, float2(input_x, input_x), slice); - output.x += dot(input, kernel_slice); - output.x += dot(input, kernel_slice1); - output.x += dot(input, kernel_slice2); - output.x += dot(input, kernel_slice3); - } - } + half4 input = inTexture.sample(sample, float2(input_x, input_y), i); + + half4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i]; + half4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i]; + half4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i]; + half4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i]; + + output.x += dot(float4(input), float4(kernel_slice0)); + + output.y += dot(float4(input), float4(kernel_slice1)); + + output.z += dot(float4(input), float4(kernel_slice2)); + + output.w += dot(float4(input), float4(kernel_slice3)); } - outTexture.write(output, gid.xy, gid.z); + outTexture.write(half4(output), gid.xy, gid.z); } +//kernel void conv_transpose(texture2d_array inTexture [[texture(0)]], +// texture2d_array outTexture [[texture(1)]], +// constant MetalConvTransposeParam ¶m [[buffer(0)]], +// const device float4 *weights [[buffer(1)]], +// uint3 gid [[thread_position_in_grid]]){ +// if (gid.x >= outTexture.get_width() || +// gid.y >= outTexture.get_height() || +// gid.z >= outTexture.get_array_size()) { +// return; +// } +// +// int input_array_size = inTexture.get_array_size(); +// +// uint kernel_one_output_slice = input_array_size * param.kernelW * param.kernelH; +// +// uint kernel_stride_z = gid.z * 4 * (kernel_one_output_slice); +// +// constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); +// +// float4 output; +// +// for (int w = 0; w < param.kernelW; ++w) { +// int top = gid.x - w * param.dilationX + param.paddingX; +// int input_x = top / param.strideX; +// if (top < 0 || input_x >= int(inTexture.get_width())) { +// continue; +// } +// +// for (int h = 0; h < param.kernelH; ++h) { +// int top_y = gid.y - h * param.dilationY + param.paddingY; +// int input_y = top_y / param.strideY; +// if (top_y < 0 || input_y >= int(inTexture.get_height())) { +// continue; +// } +// +// uint kernel_index = (w * param.kernelH + h) * inTexture.get_array_size(); +// +// for (int slice = 0; slice < input_array_size; ++slice) { +// +// float4 input; +// float4 kernel_slice = weights[kernel_stride_z + 0 * kernel_one_output_slice + kernel_index + slice]; +// float4 kernel_slice1 = weights[kernel_stride_z + 1 * kernel_one_output_slice + kernel_index + slice]; +// +// float4 kernel_slice2 = weights[kernel_stride_z + 2 * kernel_one_output_slice + kernel_index + slice]; +// +// float4 kernel_slice3 = weights[kernel_stride_z + 3 * kernel_one_output_slice + kernel_index + slice]; +// +// input = inTexture.sample(sample, float2(input_x, input_y), slice); +// output.x += dot(input, kernel_slice); +// output.y += dot(input, kernel_slice1); +// output.z += dot(input, kernel_slice2); +// output.w += dot(input, kernel_slice3); +// } +// } +// } +// +// outTexture.write(output, gid.xy, gid.z); +//} +// diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal index 5ad53a8ef94adde2de6034727b49415ba2cd17c7..cc662786de91a0c7d62aca4b0124ee68990d797e 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal @@ -18,6 +18,7 @@ using namespace metal; struct ElementwiseAddParam { + int32_t unsafe_one_dim; int32_t fast; int32_t axis; int32_t yoff; @@ -36,7 +37,10 @@ kernel void elementwise_add(texture2d_array inputX [[textur gid.y >= outTexture.get_height() || gid.z >= outTexture.get_array_size()) return; float4 rx, ry; - if (pm.fast == 1) { + if (pm.unsafe_one_dim == 1) { + rx = inputX.read(gid.xy, gid.z); + ry = inputY.read(uint2(0, 0), gid.z); + } else if (pm.fast == 1) { rx = inputX.read(gid.xy, gid.z); ry = inputY.read(gid.xy, gid.z); } else { @@ -59,3 +63,39 @@ kernel void elementwise_add(texture2d_array inputX [[textur float4 r = rx + ry; outTexture.write(r, gid.xy, gid.z); } + +kernel void elementwise_add_half(texture2d_array inputX [[texture(0)]], + texture2d_array inputY [[texture(1)]], + texture2d_array outTexture [[texture(2)]], + constant ElementwiseAddParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + half4 rx, ry; + if (pm.unsafe_one_dim == 1) { + rx = inputX.read(gid.xy, gid.z); + ry = inputY.read(uint2(0, 0), gid.z); + } else if (pm.fast == 1) { + rx = inputX.read(gid.xy, gid.z); + ry = inputY.read(gid.xy, gid.z); + } else { + rx = inputX.read(gid.xy, gid.z); + int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; + int32_t y_abcd[4] = {1, 1, 1, 1}, y_xyzn[4]; + int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; + int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; + for (int n = 0; n < 4; n++) { + xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); + invtrans(xtrans, x_abcd, t_abcd); + for (int k = pm.axis; k < (4 - pm.yoff); k++) { + y_abcd[k+pm.yoff] = t_abcd[k]; + } + trans(ytrans, y_abcd, t_abcd); + abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); + ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; + } + } + half4 r = rx + ry; + outTexture.write(r, gid.xy, gid.z); +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal index bd14a146100d0a0723f73ab5fd1f95d1f8e39c97..597804137743dd253d05d91a5008f558dcaf42e7 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal @@ -81,3 +81,71 @@ kernel void prelu_other(texture2d_array inTexture [[textu outTexture.write(output, gid.xy, gid.z); } + +kernel void prelu_channel_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device half4 *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + half4 alpha_value = alpha[gid.z]; + half4 output; + output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); + output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); + output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); + output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void prelu_element_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device half4 *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + + int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size(); + half4 alpha_value = alpha[alpha_to + gid.z]; + + half4 output; + output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); + output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); + output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); + output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void prelu_other_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device half *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + half alpha_value = alpha[0]; + half4 output; + output.x = input.x > 0 ? input.x : (alpha_value * input.x); + output.y = input.y > 0 ? input.y : (alpha_value * input.y); + output.z = input.z > 0 ? input.z : (alpha_value * input.z); + output.w = input.w > 0 ? input.w : (alpha_value * input.w); + outTexture.write(output, gid.xy, gid.z); +} + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift index d3d31cfcd43a81f6068238d45f3442af0fc7795f..6f42f2aa9f8d0515946ace625ed16c5040fd3099 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift @@ -60,7 +60,7 @@ class PoolOp: Operator, PoolParam

>, Runable, func delogOutput() { print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) // print("pool2d delog") diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift index c7e049e3c1b21d9747acca8812abfff8c25d6d98..52cab3425f91e9fdfa2f5932d32eb7cc3947f6ab 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift @@ -51,13 +51,13 @@ class PreluOp: Operator, PreluParam

>, Runabl func delogOutput() { print(" \(type) input: ") - print(para.input.metalTexture.toTensor(dim: (n: para.input.originDim[0], c: para.input.originDim[1], h: para.input.originDim[2], w: para.input.originDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.input.metalTexture.toTensor(dim: (n: para.input.padToFourDim[0], c: para.input.padToFourDim[1], h: para.input.padToFourDim[2], w: para.input.padToFourDim[3])).strideArray()) print(" \(type) Alpha: ") let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false) print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) } // print("softmax delog") diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift index d48fc4cd1cc4bfef9d752ea999cb39b61cb02941..4a27fba9839af5feb709a69a76529c60928a981e 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift @@ -76,12 +76,12 @@ class PriorBoxOp: Operator, PriorBoxParam

print(outputArray) // output // print(" \(type) output: ") -// let originDim = para.output.originDim +// let padToFourDim = para.output.padToFourDim // if para.output.transpose == [0, 1, 2, 3] { -// let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) +// let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]), texturePrecision: computePrecision) // print(outputArray.strideArray()) // } else if para.output.transpose == [0, 2, 3, 1] { -// print(para.output.metalTexture.toTensor(dim: (n: originDim[0], c: originDim[1], h: originDim[2], w: originDim[3]), texturePrecision: computePrecision).strideArray()) +// print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3]), texturePrecision: computePrecision).strideArray()) // } else { // print(" not implement") // } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift index 0325f860e078cf639c08e279970a105e3f562a32..c9f054c88af44ac3f5dd453b4696c7988d01fa8f 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift @@ -46,7 +46,7 @@ class ReluOp: Operator, ReluParam

>, Runable, func delogOutput() { print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift index 451b064ce19e0e1cb70700d046b6ab059e6df9e3..b37026d6a81c6eed4493953a84afc05d4a6b980f 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift @@ -41,8 +41,8 @@ class ReshapeParam: OpParam { for i in 0..: Operator, ReshapeParam

>, print("reshape delog") // let _: P? = para.input.metalTexture.logDesc(header: "reshape input: ", stridable: false) - let originDim = para.output.originDim + let padToFourDim = para.output.padToFourDim - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) print(outputArray.strideArray()) } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift index af776450d7f15d031a6af546d13bc1c0374249b7..66b5c7b3146d4c433e12b846a971e4b5ae579f79 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift @@ -26,7 +26,7 @@ class SoftmaxParam: OpParam { output.dim = input.dim output.tensorDim = input.tensorDim - output.originDim = input.originDim + output.padToFourDim = input.padToFourDim } catch let error { throw error } @@ -52,9 +52,11 @@ class SoftmaxOp: Operator, SoftmaxParam

>, func delogOutput() { print("softmax delog") - - let originDim = para.output.originDim - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) + print(para.input) + + print(para.output) + let padToFourDim = para.output.padToFourDim + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) print(outputArray.strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift index 8281ba543365410f49eaad1f4d992f54af30531d..0213b52bf16fa498835729c5b7e3a65600f7669d 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift @@ -48,9 +48,9 @@ class TransposeOp: Operator, TransposeParam func delogOutput() { print(" \(type) output: ") - let originDim = para.output.originDim + let padToFourDim = para.output.padToFourDim if para.output.transpose == [0, 1, 2, 3] { - let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) + let outputArray = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) print(outputArray.strideArray()) } else if para.output.transpose == [0, 2, 3, 1] { print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) diff --git a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift index a318180b2b57b162715f0088fdfd88767506ad2a..c5ee1414521e7eb92011d4f4b608ad326b005531 100644 --- a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift +++ b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift @@ -95,7 +95,28 @@ class Tensor: Tensorial { - func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16) { + func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16, convertToNHWC: Bool = true, withTranspose: Bool = false) { + if convertToNHWC { +// print(layout) + convert(to: DataLayout.NHWC()) + } + + if withTranspose { + let transposePointer = UnsafeMutablePointer

.allocate(capacity: numel()) + let n = dim[0] + let hwc = numel()/n + for j in 0.. else { fatalError(" not support yet ") } @@ -139,6 +160,8 @@ class Tensor: Tensorial { for j in 0..: Tensorial { float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count) } + convertedPointer.deinitialize(count: count) + convertedPointer.deallocate() + } + } else { + let C = dim[3] + let cSlices = (C + 3) / 4 + let paddedC = cSlices * 4 + let count = paddedC * dim[0] * dim[1] * dim[2] + if C == paddedC { + buffer = device.makeBuffer(length: count * precisionSize) + switch precision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout

.stride) + case .Float16: + float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count) + } + } else if C == 1 { + fatalError(" not support ") + } else { + buffer = device.makeBuffer(length: count * precisionSize) + let convertedPointer = UnsafeMutablePointer.allocate(capacity: count) + var tmpPointer = floatPointer + var dstPtr = convertedPointer + for _ in 0...stride) + case .Float16: + float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count) + } convertedPointer.deinitialize(count: count) convertedPointer.deallocate() } diff --git a/metal/paddle-mobile/paddle-mobile/framework/Texture.swift b/metal/paddle-mobile/paddle-mobile/framework/Texture.swift index 42a381c50cd3a66f39d9e19e9cffc8de83ca48b2..d3beb92a5fa30dfae5ddbbcd8bb7563b791771e4 100644 --- a/metal/paddle-mobile/paddle-mobile/framework/Texture.swift +++ b/metal/paddle-mobile/paddle-mobile/framework/Texture.swift @@ -41,14 +41,28 @@ extension InputTexture { public class Texture: Tensorial { var dim: Dim public var tensorDim: Dim - public var originDim: Dim + public var padToFourDim: Dim private var textureDesc: MTLTextureDescriptor! public var metalTexture: MTLTexture! var transpose: [Int] = [0, 1, 2, 3] + func toTensor() -> [Float32] { + guard padToFourDim.cout() == 4 else { + fatalError("- not support -") + } + return metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3])) + } + + func realNHWC() -> [Float32] { + guard padToFourDim.cout() == 4 else { + fatalError(" - not support - ") + } + return metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) + } + func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) { transpose = inTranspose - let newDim = transpose.map { originDim[$0] } + let newDim = transpose.map { padToFourDim[$0] } let newLayout = transpose.map { layout.layoutWithDim[$0] } @@ -93,7 +107,7 @@ public class Texture: Tensorial { } tensorDim = inDim dim = fourDim - originDim = fourDim + padToFourDim = fourDim layout = DataLayout.init([(.N, fourDim[0]), (.C, fourDim[1]), (.H, fourDim[2]), (.W, fourDim[3])]) }