From 6ec031ffb208a60065d89b44452014a6b8567604 Mon Sep 17 00:00:00 2001 From: liuruilong Date: Thu, 30 Aug 2018 20:14:18 +0800 Subject: [PATCH] add dilation --- .../paddle-mobile-demo/ViewController.swift | 5 +- .../paddle-mobile.xcodeproj/project.pbxproj | 2 + .../paddle-mobile/Common/MetalExtension.swift | 2 +- .../Common/PaddleMobileUnitTest.swift | 2 +- .../paddle-mobile/Executor.swift | 19 +- .../paddle-mobile/paddle-mobile/Loader.swift | 314 +++++---- .../paddle-mobile/Operators/ConvAddOp.swift | 7 + .../Operators/ConvTransposeOp.swift | 13 +- .../Operators/ElementwiseAddOp.swift | 13 +- .../Kernels/ConvAddBatchNormReluKernel.swift | 2 +- .../Operators/Kernels/ConvAddKernel.swift | 11 +- .../Operators/Kernels/ConvBNReluKernel.swift | 2 +- .../Operators/Kernels/ConvKernel.swift | 4 +- .../Kernels/ConvTransposeKernel.swift | 1 + .../Kernels/ElementwiseAddKernel.swift | 2 +- .../Operators/Kernels/metal/ConvKernel.metal | 665 +++++++++--------- .../paddle-mobile/Program/BlockDesc.swift | 6 +- .../paddle-mobile/framework/Tensor.swift | 421 +++++------ 18 files changed, 765 insertions(+), 726 deletions(-) diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift index 1674546fe5..e2351d14e3 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift @@ -26,7 +26,7 @@ let modelHelperMap: [SupportModel : Net] = [.mobilenet_ssd : MobileNet_ssd_hand. enum SupportModel: String{ // case mobilenet = "mobilenet" case mobilenet_ssd = "mobilenetssd" - case genet = "enet" + case genet = "genet" static func supportedModels() -> [SupportModel] { //.mobilenet, return [.mobilenet_ssd ,.genet] @@ -79,7 +79,7 @@ class ViewController: UIViewController { return } do { - let max = 1 + let max = 10 let startDate = Date.init() for i in 0.. [Float32] { - print("origin dim: \(dim)") +// print("origin dim: \(dim)") print("texture: ") print(self) diff --git a/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift b/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift index 0d1fd39a34..2764146929 100644 --- a/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift +++ b/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift @@ -314,7 +314,7 @@ public class PaddleMobileUnitTest { let offsetX = filterSize.width/2 - paddings.0 let offsetY = filterSize.height/2 - paddings.1 - let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), paddedZ: UInt16(paddings.0)) + let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), paddedZ: UInt16(paddings.0), dilationX: UInt16(1), dilationY: UInt16(1)) let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize) diff --git a/metal/paddle-mobile/paddle-mobile/Executor.swift b/metal/paddle-mobile/paddle-mobile/Executor.swift index f569b74c02..47c91de096 100644 --- a/metal/paddle-mobile/paddle-mobile/Executor.swift +++ b/metal/paddle-mobile/paddle-mobile/Executor.swift @@ -14,7 +14,7 @@ import Foundation -let testTo = 12 +let testTo = 54 public class ResultHolder { public let dim: [Int] @@ -62,7 +62,7 @@ public class Executor { queue = inQueue for block in inProgram.programDesc.blocks { //block.ops.count - for i in 0...shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope) @@ -124,13 +124,13 @@ public class Executor { // print(stridableInput) // let _: Flo? = input.logDesc(header: "input: ", stridable: true) - for i in 0.. { return p }), inElapsedTime: afterDate.timeIntervalSince(beforeDate)) } - completionHandle(resultHolder) } diff --git a/metal/paddle-mobile/paddle-mobile/Loader.swift b/metal/paddle-mobile/paddle-mobile/Loader.swift index 5c2733a5d9..4b5f91f9c7 100644 --- a/metal/paddle-mobile/paddle-mobile/Loader.swift +++ b/metal/paddle-mobile/paddle-mobile/Loader.swift @@ -16,168 +16,170 @@ import Foundation import SwiftProtobuf public class Loader { - class ParaLoader { - let file: UnsafeMutablePointer - let fileSize: Int - var nowIndex: Int - init(paramPath: String) throws { - guard let tmpFile = fopen(paramPath, "rb") else { - throw PaddleMobileError.loaderError(message: "open param file error" + paramPath) - } - file = tmpFile - fseek(file, 0, SEEK_END) - fileSize = ftell(file) - guard fileSize > 0 else { - throw PaddleMobileError.loaderError(message: "param file size is too small") - } - rewind(file) - nowIndex = 0 - } - - func read(tensor: Tensor

) throws { - guard nowIndex <= fileSize else { - throw PaddleMobileError.loaderError(message: "out of the file range") - } - - func pointerReader(type: T.Type) -> T { - let ptr = UnsafeMutablePointer.allocate(capacity: MemoryLayout.size) - fread(ptr, 1, MemoryLayout.size, file) - nowIndex += MemoryLayout.size - let pointee = ptr.pointee - ptr.deinitialize(count: MemoryLayout.size) - ptr.deallocate() - return pointee - } - - let _ = pointerReader(type: UInt32.self) - let lodLevel = pointerReader(type: UInt64.self) - for _ in 0...size)){ - _ = pointerReader(type: size_t.self) - } - } - - let _ = pointerReader(type: UInt32.self) - - let tensorDescSize = pointerReader(type: Int32.self) - - fseek(file, Int(tensorDescSize), SEEK_CUR) - nowIndex += Int(tensorDescSize) - - /* - 这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度 - */ - - //现在模型传入模型为 Float 类型, 这块应该根据模型来 -// let tmpCapacity = MemoryLayout.size * tensor.numel() -// let tmpPointer = UnsafeMutablePointer.allocate(capacity: tmpCapacity); - let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file) - - guard bytesRead == tensor.data.size else { - throw PaddleMobileError.loaderError(message: "param read size error") - } - - // TODO: use script to convert -// let bytesRead = fread(tmpPointer, 1, tmpCapacity, file) -// for i in 0.. + let fileSize: Int + var nowIndex: Int + init(paramPath: String) throws { + guard let tmpFile = fopen(paramPath, "rb") else { + throw PaddleMobileError.loaderError(message: "open param file error" + paramPath) + } + file = tmpFile + fseek(file, 0, SEEK_END) + fileSize = ftell(file) + guard fileSize > 0 else { + throw PaddleMobileError.loaderError(message: "param file size is too small") + } + rewind(file) + nowIndex = 0 } - public init(){} - public func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program{ - guard let modelData = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else { - throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !") + + func read(tensor: Tensor

) throws { + guard nowIndex <= fileSize else { + throw PaddleMobileError.loaderError(message: "out of the file range") + } + + func pointerReader(type: T.Type) -> T { + let ptr = UnsafeMutablePointer.allocate(capacity: MemoryLayout.size) + fread(ptr, 1, MemoryLayout.size, file) + nowIndex += MemoryLayout.size + let pointee = ptr.pointee + ptr.deinitialize(count: MemoryLayout.size) + ptr.deallocate() + return pointee + } + + let _ = pointerReader(type: UInt32.self) + let lodLevel = pointerReader(type: UInt64.self) + for _ in 0...size)){ + _ = pointerReader(type: size_t.self) } - - do { - let protoProgram = try PaddleMobile_Framework_Proto_ProgramDesc.init( - serializedData: modelData) - - let originProgramDesc = ProgramDesc.init(protoProgram: protoProgram) - let programDesc = ProgramOptimize

.init().optimize(originProgramDesc: originProgramDesc) - print(programDesc) - - guard let paraLoader = try? ParaLoader.init(paramPath: paraPath) else { - throw PaddleMobileError.loaderError(message: "load para error") - } - - guard programDesc.blocks.count > 0 else { - throw PaddleMobileError.loaderError(message: "count of blocks must greater than 0") - } - - // to get feed key and fetch key - let block = programDesc.blocks[0] - guard let firstOp = block.ops.first, let lastOp = block.ops.last else { - throw PaddleMobileError.loaderError(message: "at least two operator") - } - guard firstOp.type == gFeedType, lastOp.type == gFetchType else { - throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch") + } + + let _ = pointerReader(type: UInt32.self) + + let tensorDescSize = pointerReader(type: Int32.self) + + fseek(file, Int(tensorDescSize), SEEK_CUR) + nowIndex += Int(tensorDescSize) + + /* + 这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度 + */ + + //现在模型传入模型为 Float 类型, 这块应该根据模型来 + // let tmpCapacity = MemoryLayout.size * tensor.numel() + // let tmpPointer = UnsafeMutablePointer.allocate(capacity: tmpCapacity); + let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file) + + guard bytesRead == tensor.data.size else { + throw PaddleMobileError.loaderError(message: "param read size error") + } + + // TODO: use script to convert + // let bytesRead = fread(tmpPointer, 1, tmpCapacity, file) + // for i in 0.. Program{ + + guard let modelData = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else { + throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !") + } + + do { + let protoProgram = try PaddleMobile_Framework_Proto_ProgramDesc.init( + serializedData: modelData) + + let originProgramDesc = ProgramDesc.init(protoProgram: protoProgram) + let programDesc = ProgramOptimize

.init().optimize(originProgramDesc: originProgramDesc) + print(programDesc) + + guard let paraLoader = try? ParaLoader.init(paramPath: paraPath) else { + throw PaddleMobileError.loaderError(message: "load para error") + } + + guard programDesc.blocks.count > 0 else { + throw PaddleMobileError.loaderError(message: "count of blocks must greater than 0") + } + + // to get feed key and fetch key + let block = programDesc.blocks[0] + guard let firstOp = block.ops.first, let lastOp = block.ops.last else { + throw PaddleMobileError.loaderError(message: "at least two operator") + } + + guard firstOp.type == gFeedType, lastOp.type == gFetchType else { + throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch") + } + + guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else { + throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found") + } + guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else { + throw PaddleMobileError.loaderError(message: "feed key or fetch key not found") + } + + let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey) + + // to load memory + for block in programDesc.blocks { + for varDesc in block.vars { + if (varDesc.type == .LodTensor) { + guard let tensorDesc = varDesc.tensorDesc else { + throw PaddleMobileError.loaderError(message: "get tensor desc failed") } - guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else { - throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found") - } - guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else { - throw PaddleMobileError.loaderError(message: "feed key or fetch key not found") + if (varDesc.persistable + && varDesc.type != .FeedMiniBatch + && varDesc.type != .FetchList) { + let dimArr = tensorDesc.dims + + guard dimArr.count > 0 else { + throw PaddleMobileError.loaderError(message: "tensor desc dim size error") + } + + let dim = Dim.init(inDim: dimArr) + let tensor = Tensor

.init(inDim: dim, inLayout: tensorDesc.dataLayout) + do { + try paraLoader.read(tensor: tensor) + } catch let error { + throw error + } + tensor.convert(to: DataLayout.NHWC()) + // tensor.initBuffer(device: device) + scope[varDesc.name] = tensor + } else { + let dim = Dim.init(inDim: tensorDesc.dims) + scope[varDesc.name] = Texture

.init(device: device, inDim: dim) } - - let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey) - - // to load memory - for block in programDesc.blocks { - for varDesc in block.vars { - if (varDesc.type == .LodTensor) { - guard let tensorDesc = varDesc.tensorDesc else { - throw PaddleMobileError.loaderError(message: "get tensor desc failed") - } - - if (varDesc.persistable - && varDesc.type != .FeedMiniBatch - && varDesc.type != .FetchList) { - let dimArr = tensorDesc.dims - - guard dimArr.count > 0 else { - throw PaddleMobileError.loaderError(message: "tensor desc dim size error") - } - - let dim = Dim.init(inDim: dimArr) - let tensor = Tensor

.init(inDim: dim, inLayout: tensorDesc.dataLayout) - do { - try paraLoader.read(tensor: tensor) - } catch let error { - throw error - } - tensor.convert(to: DataLayout.NHWC()) -// tensor.initBuffer(device: device) - scope[varDesc.name] = tensor - } else { - let dim = Dim.init(inDim: tensorDesc.dims) - scope[varDesc.name] = Texture

.init(device: device, inDim: dim) - } - } else { - if varDesc.name == fetchKey { - scope[varDesc.name] = ResultHolder

.init(inDim: [], inResult: [], inElapsedTime: 0.0) - } else if varDesc.name == feedKey { - } - } - } + } else { + if varDesc.name == fetchKey { + scope[varDesc.name] = ResultHolder

.init(inDim: [], inResult: [], inElapsedTime: 0.0) + } else if varDesc.name == feedKey { } - - let program = Program.init(inProgramDesc: programDesc, inParamPath: paraPath, inScope: scope) - - return program - } catch _ { - throw PaddleMobileError.loaderError(message: "protobuf decoder error") + } } + } + + let program = Program.init(inProgramDesc: programDesc, inParamPath: paraPath, inScope: scope) + + return program + } catch _ { + throw PaddleMobileError.loaderError(message: "protobuf decoder error") } + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift index e79707a475..c42e5fa1d8 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift @@ -97,6 +97,13 @@ class ConvAddOp: Operator, ConvAddParam

>, } func delogOutput() { + print("stride: ") + print(para.stride) + print("dilations: ") + print(para.dilations) + + + print(" \(type) output: ") print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift index cf67ef7ccf..ba83de1bf8 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift @@ -43,8 +43,15 @@ class ConvTransposeOp: Operator, ConvTr } func delogOutput() { - print("conv transpose delog") - let _: P? = para.input.metalTexture.logDesc(header: "conv transpose input: ", stridable: true) - let _: P? = para.output.metalTexture.logDesc(header: "conv transpose output: ", stridable: true) + print(" \(type) output: ") + let originDim = para.output.originDim + if para.output.transpose == [0, 1, 2, 3] { + let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) + print(outputArray.strideArray()) + } else if para.output.transpose == [0, 2, 3, 1] { + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + } else { + print(" not implement") + } } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift index a6c3162155..4812f05182 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift @@ -61,11 +61,18 @@ class ElementwiseAddOp: Operator, Elem print(para.inputX.metalTexture.toTensor(dim: (n: para.inputX.tensorDim[0], c: para.inputX.tensorDim[1], h: para.inputX.tensorDim[2], w: para.inputX.tensorDim[3])).strideArray()) print(" \(type) inputY: ") print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray()) + print(" \(type) output: ") let originDim = para.output.originDim - let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) - print(outputArray.strideArray()) - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + if para.output.transpose == [0, 1, 2, 3] { + let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) + print(outputArray.strideArray()) + } else if para.output.transpose == [0, 2, 3, 1] { + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + } else { + print(" not implement") + } + } func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift index f2bc495158..eabadc9d44 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift @@ -75,7 +75,7 @@ class ConvAddBatchNormReluKernel: Kernel, Computable, Testable print("offset y: \(offsetY)") let offsetZ = 0.0 - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3])) + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) var invs: [P] = [] let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift index 3c999052fc..83dd4f996a 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift @@ -27,9 +27,10 @@ class ConvAddKernel: Kernel, Computable { param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1]) - let offsetX = param.filter.width/2 - Int(param.paddings[0]) - let offsetY = param.filter.height/2 - Int(param.paddings[1]) + let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) + let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) + param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) @@ -37,7 +38,11 @@ class ConvAddKernel: Kernel, Computable { print("offset y: \(offsetY)") let offsetZ = 0.0 - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3])) + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + print("metal param: ") + print(inMetalParam) + + metalParam = inMetalParam } func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam

) throws { diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift index 2878d82879..c5d3ffe6c9 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift @@ -81,7 +81,7 @@ class ConvBNReluKernel: Kernel, Computable, Testable { let offsetZ = 0.0 print(" fuck ") - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3])) + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) var invs: [P] = [] let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift index 60ce245a78..680beba1ea 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift @@ -21,6 +21,8 @@ public struct MetalConvParam { let strideX: UInt16 let strideY: UInt16 let paddedZ: UInt16 + let dilationX: UInt16 + let dilationY: UInt16 } class ConvKernel: Kernel, Computable { @@ -39,7 +41,7 @@ class ConvKernel: Kernel, Computable { let offsetZ = 0.0 param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3])) + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) } func compute(commandBuffer: MTLCommandBuffer, param: ConvParam

) throws { diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift index 81aa9d2ff0..9354972d71 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift @@ -45,6 +45,7 @@ class ConvTransposeKernel: Kernel, Computable{ metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY) param.output.initTexture(device: device, inTranspose: param.input.transpose) + param.filter.initBuffer(device: device) } func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam

) throws { diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift index 884b2ae3c3..c80df94bb5 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift @@ -55,7 +55,7 @@ class ElementwiseAddKernel: Kernel, Computable { } emp.yoff = 4 - Int32(param.inputY.tensorDim.cout()) if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) { - print("===> elementwise_add fast!!!") +// print("===> elementwise_add fast!!!") emp.fast = 1 } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal index 06c93da59d..a17366cbe0 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal @@ -16,198 +16,198 @@ using namespace metal; struct MetalConvParam { - short offsetX; - short offsetY; - short offsetZ; - ushort strideX; - ushort strideY; + short offsetX; + short offsetY; + short offsetZ; + ushort strideX; + ushort strideY; + ushort dilationX; + ushort dilationY; }; - kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + half4 output = half4(0.0); + + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); - half4 output = half4(0.0); + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); - half4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); - output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); - outTexture.write(output, gid.xy, gid.z); + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + + output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); + outTexture.write(output, gid.xy, gid.z); } kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - half4 output = half4(0.0); - - half4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + half4 output = half4(0.0); + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); } - output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); - outTexture.write(output, gid.xy, gid.z); + } + output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); + outTexture.write(output, gid.xy, gid.z); } kernel void conv_add_1x1_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + half4 output = half4(0.0); + + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); - half4 output = half4(0.0); + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); - half4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); } kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - half4 output = half4(0.0); - half4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - half4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); - outTexture.write(output, gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + half4 output = half4(0.0); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); + outTexture.write(output, gid.xy, gid.z); } @@ -223,41 +223,41 @@ kernel void conv_add_batch_norm_relu_1x1(texture2d_array const device float4 *new_scale [[buffer(3)]], const device float4 *new_biase [[buffer(4)]], uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); - float4 output = float4(0.0); + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); - float4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); } kernel void conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], @@ -268,92 +268,92 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array const device float4 *new_scale [[buffer(3)]], const device float4 *new_biase [[buffer(4)]], uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); } - output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); + } + output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); } kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = float4(0.0); - float4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - float4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + float4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + float4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); } // conv @@ -447,10 +447,10 @@ kernel void depthwise_conv_3x3(texture2d_array inTexture } kernel void conv_1x1(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - uint3 gid [[thread_position_in_grid]]) { + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() || @@ -532,13 +532,13 @@ kernel void conv_add_1x1(texture2d_array inTexture [[text } kernel void conv_add_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() || @@ -556,17 +556,20 @@ kernel void conv_add_3x3(texture2d_array inTexture [[text float4 output = float4(0.0); + ushort dilation_x = param.dilationX; + ushort dilation_y = param.dilationY; + float4 input[9]; for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); for (int j = 0; j < 9; ++j) { float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; output.x += dot(input[j], weight_x); @@ -586,13 +589,13 @@ kernel void conv_add_3x3(texture2d_array inTexture [[text } kernel void depthwise_conv_add_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() || @@ -629,12 +632,12 @@ kernel void depthwise_conv_add_3x3(texture2d_array inText #pragma mark - conv bn relu kernel void conv_batch_norm_relu_1x1(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *new_scale [[buffer(2)]], - const device float4 *new_biase [[buffer(3)]], - uint3 gid [[thread_position_in_grid]]) { + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *new_scale [[buffer(2)]], + const device float4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() || @@ -673,12 +676,12 @@ kernel void conv_batch_norm_relu_1x1(texture2d_array inTe } kernel void conv_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *new_scale [[buffer(2)]], - const device float4 *new_biase [[buffer(3)]], - uint3 gid [[thread_position_in_grid]]) { + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *new_scale [[buffer(2)]], + const device float4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() || @@ -726,12 +729,12 @@ kernel void conv_batch_norm_relu_3x3(texture2d_array inTe } kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float *weights [[buffer(1)]], - const device float4 *new_scale [[buffer(2)]], - const device float4 *new_biase [[buffer(3)]], - uint3 gid [[thread_position_in_grid]]) { + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float *weights [[buffer(1)]], + const device float4 *new_scale [[buffer(2)]], + const device float4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() || diff --git a/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift b/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift index 8e1915a497..f1a94e4e2a 100644 --- a/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift +++ b/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift @@ -48,8 +48,10 @@ extension BlockDesc: CustomStringConvertible, CustomDebugStringConvertible { var description: String { var str = "" - for op in ops { - str += op.description + for i in 0.. Int - var layout: DataLayout { get } + var dim: Dim { get set } + func numel() -> Int + var layout: DataLayout { get } } extension Tensorial { - func numel() -> Int { - return dim.numel() - } + func numel() -> Int { + return dim.numel() + } } class Tensor: Tensorial { - enum BufferPrecision { - case Float32, Float16 + enum BufferPrecision { + case Float32, Float16 + } + + var data: Data + var dim: Dim + var buffer: MTLBuffer! + private(set) var layout: DataLayout + + class Data { + init(inSize: Int, inPointer: UnsafeMutablePointer

) { + size = inSize + pointer = inPointer } - - var data: Data - var dim: Dim - var buffer: MTLBuffer! - private(set) var layout: DataLayout - - class Data { - init(inSize: Int, inPointer: UnsafeMutablePointer

) { - size = inSize - pointer = inPointer - } - let size: Int - var pointer: UnsafeMutablePointer

- subscript(index: Int) -> P{ - get { - return pointer[index] - } - set { - pointer[index] = newValue - } - } - func release() { - pointer.deinitialize(count: size) - pointer.deallocate() - } - deinit { -// release() - } + let size: Int + var pointer: UnsafeMutablePointer

+ subscript(index: Int) -> P{ + get { + return pointer[index] + } + set { + pointer[index] = newValue + } } - - required init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) { - dim = inDim - let size = inDim.numel() * MemoryLayout

.size - let pointer = UnsafeMutablePointer

.allocate(capacity: size) - data = Data.init(inSize: size, inPointer: pointer) - layout = inLayout + func release() { + pointer.deinitialize(count: size) + pointer.deallocate() + } + deinit { + // release() + } + } + + required init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) { + dim = inDim + let size = inDim.numel() * MemoryLayout

.size + let pointer = UnsafeMutablePointer

.allocate(capacity: size) + data = Data.init(inSize: size, inPointer: pointer) + layout = inLayout + } + + func convert(to: DataLayout) { + guard to != layout else { + return } - func convert(to: DataLayout) { - guard to != layout else { - return - } - - guard dim.cout() == 4 else { - return - } - - guard layout == DataLayout.NCHW() && to == DataLayout.NHWC() else { - // other not support - return - } - let newPointer = UnsafeMutablePointer

.allocate(capacity: data.size) - - if layout == DataLayout.NCHW() { - NCHW2NHWC(newPtr: newPointer) - } - - data.release() - data.pointer = newPointer - layout = to + guard dim.cout() == 4 else { + return } - func float32ToFloat16(input: UnsafeMutablePointer, output: UnsafeMutableRawPointer, count: Int) { - var float32Buffer = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4) - var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2) - guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else { - fatalError(" float 32 to float 16 error ! ") - } + guard layout == DataLayout.NCHW() && to == DataLayout.NHWC() else { + // other not support + return } + let newPointer = UnsafeMutablePointer

.allocate(capacity: data.size) - func initBuffer(device: MTLDevice, precision: BufferPrecision = .Float32) { - guard let floatPointer = data.pointer as? UnsafeMutablePointer else { - fatalError(" not support yet ") - } - - let precisionSize: Int - switch precision { - case .Float32: - precisionSize = 4 - case .Float16: - precisionSize = 2 - } - - if dim.cout() == 4 { - if layout == DataLayout.NHWC() { - let C = dim[3] - let cSlices = (C + 3) / 4 - let paddedC = cSlices * 4 - let count = paddedC * dim[0] * dim[1] * dim[2] - if C == paddedC { - buffer = device.makeBuffer(length: count * precisionSize) - switch precision { - case .Float32: - buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout

.stride) - case .Float16: - float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count) - } - } else if C == 1 { - buffer = device.makeBuffer(length: numel() * precisionSize) - switch precision { - case .Float32: - buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout

.stride) - case .Float16: - float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel()) - } - } else { - buffer = device.makeBuffer(length: count * precisionSize) - let convertedPointer = UnsafeMutablePointer.allocate(capacity: count) - var tmpPointer = floatPointer - var dstPtr = convertedPointer - for _ in 0...stride) - case .Float16: - float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count) - } - - convertedPointer.deinitialize(count: count) - convertedPointer.deallocate() - } - } - } else if dim.cout() == 1 { - buffer = device.makeBuffer(length: numel() * precisionSize) - switch precision { - case .Float32: - buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout

.stride) - case .Float16: - float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel()) - } - } else { - fatalError(" not support !") - } - //TODO: release - data.release() + if layout == DataLayout.NCHW() { + NCHW2NHWC(newPtr: newPointer) } - var width: Int { - get { - if dim.cout() == 4 { - return dim[1] - } else { - fatalError() - } - } + data.release() + data.pointer = newPointer + layout = to + } + + func float32ToFloat16(input: UnsafeMutablePointer, output: UnsafeMutableRawPointer, count: Int) { + var float32Buffer = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4) + var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2) + guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else { + fatalError(" float 32 to float 16 error ! ") + } + } + + func initBuffer(device: MTLDevice, precision: BufferPrecision = .Float32) { + guard let floatPointer = data.pointer as? UnsafeMutablePointer else { + fatalError(" not support yet ") } - var height: Int { - get { - if dim.cout() == 4 { - return dim[2] - } else { - fatalError() - } - } + let precisionSize: Int + switch precision { + case .Float32: + precisionSize = 4 + case .Float16: + precisionSize = 2 } - var channel: Int { - get { - if dim.cout() == 4 { - return dim[3] - } else { - fatalError() + if dim.cout() == 4 { + if layout == DataLayout.NHWC() { + let C = dim[3] + let cSlices = (C + 3) / 4 + let paddedC = cSlices * 4 + let count = paddedC * dim[0] * dim[1] * dim[2] + if C == paddedC { + buffer = device.makeBuffer(length: count * precisionSize) + switch precision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout

.stride) + case .Float16: + float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count) + } + } else if C == 1 { + buffer = device.makeBuffer(length: numel() * precisionSize) + switch precision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout

.stride) + case .Float16: + float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel()) + } + } else { + buffer = device.makeBuffer(length: count * precisionSize) + let convertedPointer = UnsafeMutablePointer.allocate(capacity: count) + var tmpPointer = floatPointer + var dstPtr = convertedPointer + for _ in 0...stride) + case .Float16: + float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count) + } + + convertedPointer.deinitialize(count: count) + convertedPointer.deallocate() } + } + } else if dim.cout() == 1 { + let num = ((numel() + 3) / 4) * 4 + buffer = device.makeBuffer(length: num * precisionSize) + switch precision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: num * MemoryLayout

.stride) + case .Float16: + float32ToFloat16(input: floatPointer, output: buffer.contents(), count: num) + } + } else { + fatalError(" not support !") } - + //TODO: release + data.release() + } + + var width: Int { + get { + if dim.cout() == 4 { + return dim[1] + } else { + fatalError() + } + } + } + + var height: Int { + get { + if dim.cout() == 4 { + return dim[2] + } else { + fatalError() + } + } + } + + var channel: Int { + get { + if dim.cout() == 4 { + return dim[3] + } else { + fatalError() + } + } + } + + + func NCHW2NHWC(newPtr: UnsafeMutablePointer

) { + let N = dim[0] + let C = dim[1] + let H = dim[2] + let W = dim[3] + let HXW = H * W + let CXHXW = C * H * W - func NCHW2NHWC(newPtr: UnsafeMutablePointer

) { - let N = dim[0] - let C = dim[1] - let H = dim[2] - let W = dim[3] - let HXW = H * W - let CXHXW = C * H * W - - var index: Int = 0 - for n in 0...size { - str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])" - } - return str + + var debugDescription: String { + var str = "dim: \(dim) \n" + str += "MTLBuffer: \(self.buffer) \n" + for i in 0...size { + str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])" } - - func logDataPointer(header: String = "") { - print(header) - var str = "" - str += "data size: \(data.size) \n" - str += "dim: \(dim) \n" - for i in 0..