提交 6ec031ff 编写于 作者: L liuruilong

add dilation

上级 ee6ef4d9
...@@ -26,7 +26,7 @@ let modelHelperMap: [SupportModel : Net] = [.mobilenet_ssd : MobileNet_ssd_hand. ...@@ -26,7 +26,7 @@ let modelHelperMap: [SupportModel : Net] = [.mobilenet_ssd : MobileNet_ssd_hand.
enum SupportModel: String{ enum SupportModel: String{
// case mobilenet = "mobilenet" // case mobilenet = "mobilenet"
case mobilenet_ssd = "mobilenetssd" case mobilenet_ssd = "mobilenetssd"
case genet = "enet" case genet = "genet"
static func supportedModels() -> [SupportModel] { static func supportedModels() -> [SupportModel] {
//.mobilenet, //.mobilenet,
return [.mobilenet_ssd ,.genet] return [.mobilenet_ssd ,.genet]
...@@ -79,7 +79,7 @@ class ViewController: UIViewController { ...@@ -79,7 +79,7 @@ class ViewController: UIViewController {
return return
} }
do { do {
let max = 1 let max = 10
let startDate = Date.init() let startDate = Date.init()
for i in 0..<max { for i in 0..<max {
try net.predict(inTexture: inTexture) { [weak self] (result) in try net.predict(inTexture: inTexture) { [weak self] (result) in
...@@ -87,6 +87,7 @@ class ViewController: UIViewController { ...@@ -87,6 +87,7 @@ class ViewController: UIViewController {
fatalError() fatalError()
} }
print(result.resultArray)
if i == max - 1 { if i == max - 1 {
let time = Date.init().timeIntervalSince(startDate) let time = Date.init().timeIntervalSince(startDate)
DispatchQueue.main.async { DispatchQueue.main.async {
......
...@@ -699,6 +699,7 @@ ...@@ -699,6 +699,7 @@
"@executable_path/Frameworks", "@executable_path/Frameworks",
"@loader_path/Frameworks", "@loader_path/Frameworks",
); );
MACH_O_TYPE = mh_dylib;
MTL_LANGUAGE_REVISION = UseDeploymentTarget; MTL_LANGUAGE_REVISION = UseDeploymentTarget;
PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile"; PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile";
PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
...@@ -727,6 +728,7 @@ ...@@ -727,6 +728,7 @@
"@executable_path/Frameworks", "@executable_path/Frameworks",
"@loader_path/Frameworks", "@loader_path/Frameworks",
); );
MACH_O_TYPE = mh_dylib;
MTL_LANGUAGE_REVISION = UseDeploymentTarget; MTL_LANGUAGE_REVISION = UseDeploymentTarget;
PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile"; PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile";
PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
......
...@@ -342,7 +342,7 @@ public extension MTLTexture { ...@@ -342,7 +342,7 @@ public extension MTLTexture {
// n c h w - dim // n c h w - dim
func toTensor(dim: (n: Int, c: Int, h: Int, w: Int)) -> [Float32] { func toTensor(dim: (n: Int, c: Int, h: Int, w: Int)) -> [Float32] {
print("origin dim: \(dim)") // print("origin dim: \(dim)")
print("texture: ") print("texture: ")
print(self) print(self)
......
...@@ -314,7 +314,7 @@ public class PaddleMobileUnitTest { ...@@ -314,7 +314,7 @@ public class PaddleMobileUnitTest {
let offsetX = filterSize.width/2 - paddings.0 let offsetX = filterSize.width/2 - paddings.0
let offsetY = filterSize.height/2 - paddings.1 let offsetY = filterSize.height/2 - paddings.1
let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), paddedZ: UInt16(paddings.0)) let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), paddedZ: UInt16(paddings.0), dilationX: UInt16(1), dilationY: UInt16(1))
let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize) let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize)
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
import Foundation import Foundation
let testTo = 12 let testTo = 54
public class ResultHolder<P: PrecisionType> { public class ResultHolder<P: PrecisionType> {
public let dim: [Int] public let dim: [Int]
...@@ -62,7 +62,7 @@ public class Executor<P: PrecisionType> { ...@@ -62,7 +62,7 @@ public class Executor<P: PrecisionType> {
queue = inQueue queue = inQueue
for block in inProgram.programDesc.blocks { for block in inProgram.programDesc.blocks {
//block.ops.count //block.ops.count
for i in 0..<testTo { for i in 0..<block.ops.count {
let op = block.ops[i] let op = block.ops[i]
do { do {
let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope) let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
...@@ -124,13 +124,13 @@ public class Executor<P: PrecisionType> { ...@@ -124,13 +124,13 @@ public class Executor<P: PrecisionType> {
// print(stridableInput) // print(stridableInput)
// let _: Flo? = input.logDesc(header: "input: ", stridable: true) // let _: Flo? = input.logDesc(header: "input: ", stridable: true)
for i in 0..<self.ops.count { // for i in 0..<self.ops.count {
let op = self.ops[i] // let op = self.ops[i]
print(" 第 \(i) 个 op: ") // print(" 第 \(i) 个 op: ")
op.delogOutput() // op.delogOutput()
} // }
//
return // return
let afterDate = Date.init() let afterDate = Date.init()
...@@ -145,7 +145,6 @@ public class Executor<P: PrecisionType> { ...@@ -145,7 +145,6 @@ public class Executor<P: PrecisionType> {
return p return p
}), inElapsedTime: afterDate.timeIntervalSince(beforeDate)) }), inElapsedTime: afterDate.timeIntervalSince(beforeDate))
} }
completionHandle(resultHolder) completionHandle(resultHolder)
} }
......
...@@ -16,168 +16,170 @@ import Foundation ...@@ -16,168 +16,170 @@ import Foundation
import SwiftProtobuf import SwiftProtobuf
public class Loader<P: PrecisionType> { public class Loader<P: PrecisionType> {
class ParaLoader { class ParaLoader {
let file: UnsafeMutablePointer<FILE> let file: UnsafeMutablePointer<FILE>
let fileSize: Int let fileSize: Int
var nowIndex: Int var nowIndex: Int
init(paramPath: String) throws { init(paramPath: String) throws {
guard let tmpFile = fopen(paramPath, "rb") else { guard let tmpFile = fopen(paramPath, "rb") else {
throw PaddleMobileError.loaderError(message: "open param file error" + paramPath) throw PaddleMobileError.loaderError(message: "open param file error" + paramPath)
} }
file = tmpFile file = tmpFile
fseek(file, 0, SEEK_END) fseek(file, 0, SEEK_END)
fileSize = ftell(file) fileSize = ftell(file)
guard fileSize > 0 else { guard fileSize > 0 else {
throw PaddleMobileError.loaderError(message: "param file size is too small") throw PaddleMobileError.loaderError(message: "param file size is too small")
} }
rewind(file) rewind(file)
nowIndex = 0 nowIndex = 0
}
func read(tensor: Tensor<P>) throws {
guard nowIndex <= fileSize else {
throw PaddleMobileError.loaderError(message: "out of the file range")
}
func pointerReader<T>(type: T.Type) -> T {
let ptr = UnsafeMutablePointer<T>.allocate(capacity: MemoryLayout<T>.size)
fread(ptr, 1, MemoryLayout<T>.size, file)
nowIndex += MemoryLayout<T>.size
let pointee = ptr.pointee
ptr.deinitialize(count: MemoryLayout<UInt32>.size)
ptr.deallocate()
return pointee
}
let _ = pointerReader(type: UInt32.self)
let lodLevel = pointerReader(type: UInt64.self)
for _ in 0..<lodLevel {
let size = pointerReader(type: UInt64.self)
for _ in 0..<Int(size/UInt64(MemoryLayout<size_t>.size)){
_ = pointerReader(type: size_t.self)
}
}
let _ = pointerReader(type: UInt32.self)
let tensorDescSize = pointerReader(type: Int32.self)
fseek(file, Int(tensorDescSize), SEEK_CUR)
nowIndex += Int(tensorDescSize)
/*
这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度
*/
//现在模型传入模型为 Float 类型, 这块应该根据模型来
// let tmpCapacity = MemoryLayout<Float>.size * tensor.numel()
// let tmpPointer = UnsafeMutablePointer<Float>.allocate(capacity: tmpCapacity);
let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file)
guard bytesRead == tensor.data.size else {
throw PaddleMobileError.loaderError(message: "param read size error")
}
// TODO: use script to convert
// let bytesRead = fread(tmpPointer, 1, tmpCapacity, file)
// for i in 0..<tensor.numel() {
// tensor.data[i] = P.init(inFloat: tmpPointer[i])
// }
// tmpPointer.deinitialize(count: tmpCapacity)
// tmpPointer.deallocate()
nowIndex += bytesRead
}
deinit {
fclose(file)
}
} }
public init(){}
public func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program{ func read(tensor: Tensor<P>) throws {
guard let modelData = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else { guard nowIndex <= fileSize else {
throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !") throw PaddleMobileError.loaderError(message: "out of the file range")
}
func pointerReader<T>(type: T.Type) -> T {
let ptr = UnsafeMutablePointer<T>.allocate(capacity: MemoryLayout<T>.size)
fread(ptr, 1, MemoryLayout<T>.size, file)
nowIndex += MemoryLayout<T>.size
let pointee = ptr.pointee
ptr.deinitialize(count: MemoryLayout<UInt32>.size)
ptr.deallocate()
return pointee
}
let _ = pointerReader(type: UInt32.self)
let lodLevel = pointerReader(type: UInt64.self)
for _ in 0..<lodLevel {
let size = pointerReader(type: UInt64.self)
for _ in 0..<Int(size/UInt64(MemoryLayout<size_t>.size)){
_ = pointerReader(type: size_t.self)
} }
}
do {
let protoProgram = try PaddleMobile_Framework_Proto_ProgramDesc.init( let _ = pointerReader(type: UInt32.self)
serializedData: modelData)
let tensorDescSize = pointerReader(type: Int32.self)
let originProgramDesc = ProgramDesc.init(protoProgram: protoProgram)
let programDesc = ProgramOptimize<P>.init().optimize(originProgramDesc: originProgramDesc) fseek(file, Int(tensorDescSize), SEEK_CUR)
print(programDesc) nowIndex += Int(tensorDescSize)
guard let paraLoader = try? ParaLoader.init(paramPath: paraPath) else { /*
throw PaddleMobileError.loaderError(message: "load para error") 这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度
} */
guard programDesc.blocks.count > 0 else { //现在模型传入模型为 Float 类型, 这块应该根据模型来
throw PaddleMobileError.loaderError(message: "count of blocks must greater than 0") // let tmpCapacity = MemoryLayout<Float>.size * tensor.numel()
} // let tmpPointer = UnsafeMutablePointer<Float>.allocate(capacity: tmpCapacity);
let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file)
// to get feed key and fetch key
let block = programDesc.blocks[0] guard bytesRead == tensor.data.size else {
guard let firstOp = block.ops.first, let lastOp = block.ops.last else { throw PaddleMobileError.loaderError(message: "param read size error")
throw PaddleMobileError.loaderError(message: "at least two operator") }
}
guard firstOp.type == gFeedType, lastOp.type == gFetchType else { // TODO: use script to convert
throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch") // let bytesRead = fread(tmpPointer, 1, tmpCapacity, file)
// for i in 0..<tensor.numel() {
// tensor.data[i] = P.init(inFloat: tmpPointer[i])
// }
// tmpPointer.deinitialize(count: tmpCapacity)
// tmpPointer.deallocate()
nowIndex += bytesRead
}
deinit {
fclose(file)
}
}
public init(){}
public func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program{
guard let modelData = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else {
throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !")
}
do {
let protoProgram = try PaddleMobile_Framework_Proto_ProgramDesc.init(
serializedData: modelData)
let originProgramDesc = ProgramDesc.init(protoProgram: protoProgram)
let programDesc = ProgramOptimize<P>.init().optimize(originProgramDesc: originProgramDesc)
print(programDesc)
guard let paraLoader = try? ParaLoader.init(paramPath: paraPath) else {
throw PaddleMobileError.loaderError(message: "load para error")
}
guard programDesc.blocks.count > 0 else {
throw PaddleMobileError.loaderError(message: "count of blocks must greater than 0")
}
// to get feed key and fetch key
let block = programDesc.blocks[0]
guard let firstOp = block.ops.first, let lastOp = block.ops.last else {
throw PaddleMobileError.loaderError(message: "at least two operator")
}
guard firstOp.type == gFeedType, lastOp.type == gFetchType else {
throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch")
}
guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else {
throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found")
}
guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else {
throw PaddleMobileError.loaderError(message: "feed key or fetch key not found")
}
let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey)
// to load memory
for block in programDesc.blocks {
for varDesc in block.vars {
if (varDesc.type == .LodTensor) {
guard let tensorDesc = varDesc.tensorDesc else {
throw PaddleMobileError.loaderError(message: "get tensor desc failed")
} }
guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else { if (varDesc.persistable
throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found") && varDesc.type != .FeedMiniBatch
} && varDesc.type != .FetchList) {
guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else { let dimArr = tensorDesc.dims
throw PaddleMobileError.loaderError(message: "feed key or fetch key not found")
guard dimArr.count > 0 else {
throw PaddleMobileError.loaderError(message: "tensor desc dim size error")
}
let dim = Dim.init(inDim: dimArr)
let tensor = Tensor<P>.init(inDim: dim, inLayout: tensorDesc.dataLayout)
do {
try paraLoader.read(tensor: tensor)
} catch let error {
throw error
}
tensor.convert(to: DataLayout.NHWC())
// tensor.initBuffer(device: device)
scope[varDesc.name] = tensor
} else {
let dim = Dim.init(inDim: tensorDesc.dims)
scope[varDesc.name] = Texture<P>.init(device: device, inDim: dim)
} }
} else {
let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey) if varDesc.name == fetchKey {
scope[varDesc.name] = ResultHolder<P>.init(inDim: [], inResult: [], inElapsedTime: 0.0)
// to load memory } else if varDesc.name == feedKey {
for block in programDesc.blocks {
for varDesc in block.vars {
if (varDesc.type == .LodTensor) {
guard let tensorDesc = varDesc.tensorDesc else {
throw PaddleMobileError.loaderError(message: "get tensor desc failed")
}
if (varDesc.persistable
&& varDesc.type != .FeedMiniBatch
&& varDesc.type != .FetchList) {
let dimArr = tensorDesc.dims
guard dimArr.count > 0 else {
throw PaddleMobileError.loaderError(message: "tensor desc dim size error")
}
let dim = Dim.init(inDim: dimArr)
let tensor = Tensor<P>.init(inDim: dim, inLayout: tensorDesc.dataLayout)
do {
try paraLoader.read(tensor: tensor)
} catch let error {
throw error
}
tensor.convert(to: DataLayout.NHWC())
// tensor.initBuffer(device: device)
scope[varDesc.name] = tensor
} else {
let dim = Dim.init(inDim: tensorDesc.dims)
scope[varDesc.name] = Texture<P>.init(device: device, inDim: dim)
}
} else {
if varDesc.name == fetchKey {
scope[varDesc.name] = ResultHolder<P>.init(inDim: [], inResult: [], inElapsedTime: 0.0)
} else if varDesc.name == feedKey {
}
}
}
} }
}
let program = Program.init(inProgramDesc: programDesc, inParamPath: paraPath, inScope: scope)
return program
} catch _ {
throw PaddleMobileError.loaderError(message: "protobuf decoder error")
} }
}
let program = Program.init(inProgramDesc: programDesc, inParamPath: paraPath, inScope: scope)
return program
} catch _ {
throw PaddleMobileError.loaderError(message: "protobuf decoder error")
} }
}
} }
...@@ -97,6 +97,13 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>, ...@@ -97,6 +97,13 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>,
} }
func delogOutput() { func delogOutput() {
print("stride: ")
print(para.stride)
print("dilations: ")
print(para.dilations)
print(" \(type) output: ") print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
} }
......
...@@ -43,8 +43,15 @@ class ConvTransposeOp<P: PrecisionType>: Operator<ConvTransposeKernel<P>, ConvTr ...@@ -43,8 +43,15 @@ class ConvTransposeOp<P: PrecisionType>: Operator<ConvTransposeKernel<P>, ConvTr
} }
func delogOutput() { func delogOutput() {
print("conv transpose delog") print(" \(type) output: ")
let _: P? = para.input.metalTexture.logDesc(header: "conv transpose input: ", stridable: true) let originDim = para.output.originDim
let _: P? = para.output.metalTexture.logDesc(header: "conv transpose output: ", stridable: true) if para.output.transpose == [0, 1, 2, 3] {
let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] {
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
} else {
print(" not implement")
}
} }
} }
...@@ -61,11 +61,18 @@ class ElementwiseAddOp<P: PrecisionType>: Operator<ElementwiseAddKernel<P>, Elem ...@@ -61,11 +61,18 @@ class ElementwiseAddOp<P: PrecisionType>: Operator<ElementwiseAddKernel<P>, Elem
print(para.inputX.metalTexture.toTensor(dim: (n: para.inputX.tensorDim[0], c: para.inputX.tensorDim[1], h: para.inputX.tensorDim[2], w: para.inputX.tensorDim[3])).strideArray()) print(para.inputX.metalTexture.toTensor(dim: (n: para.inputX.tensorDim[0], c: para.inputX.tensorDim[1], h: para.inputX.tensorDim[2], w: para.inputX.tensorDim[3])).strideArray())
print(" \(type) inputY: ") print(" \(type) inputY: ")
print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray()) print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray())
print(" \(type) output: ") print(" \(type) output: ")
let originDim = para.output.originDim let originDim = para.output.originDim
let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) if para.output.transpose == [0, 1, 2, 3] {
print(outputArray.strideArray()) let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] {
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
} else {
print(" not implement")
}
} }
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
......
...@@ -75,7 +75,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable ...@@ -75,7 +75,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
print("offset y: \(offsetY)") print("offset y: \(offsetY)")
let offsetZ = 0.0 let offsetZ = 0.0
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3])) metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
var invs: [P] = [] var invs: [P] = []
let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
......
...@@ -27,9 +27,10 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable { ...@@ -27,9 +27,10 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1]) param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1])
let offsetX = param.filter.width/2 - Int(param.paddings[0]) let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
let offsetY = param.filter.height/2 - Int(param.paddings[1])
let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
...@@ -37,7 +38,11 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable { ...@@ -37,7 +38,11 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
print("offset y: \(offsetY)") print("offset y: \(offsetY)")
let offsetZ = 0.0 let offsetZ = 0.0
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3])) let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
print("metal param: ")
print(inMetalParam)
metalParam = inMetalParam
} }
func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam<P>) throws { func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam<P>) throws {
......
...@@ -81,7 +81,7 @@ class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable { ...@@ -81,7 +81,7 @@ class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
let offsetZ = 0.0 let offsetZ = 0.0
print(" fuck ") print(" fuck ")
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3])) metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
var invs: [P] = [] var invs: [P] = []
let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
......
...@@ -21,6 +21,8 @@ public struct MetalConvParam { ...@@ -21,6 +21,8 @@ public struct MetalConvParam {
let strideX: UInt16 let strideX: UInt16
let strideY: UInt16 let strideY: UInt16
let paddedZ: UInt16 let paddedZ: UInt16
let dilationX: UInt16
let dilationY: UInt16
} }
class ConvKernel<P: PrecisionType>: Kernel, Computable { class ConvKernel<P: PrecisionType>: Kernel, Computable {
...@@ -39,7 +41,7 @@ class ConvKernel<P: PrecisionType>: Kernel, Computable { ...@@ -39,7 +41,7 @@ class ConvKernel<P: PrecisionType>: Kernel, Computable {
let offsetZ = 0.0 let offsetZ = 0.0
param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3])) metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
} }
func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws { func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws {
......
...@@ -45,6 +45,7 @@ class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{ ...@@ -45,6 +45,7 @@ class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{
metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY) metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY)
param.output.initTexture(device: device, inTranspose: param.input.transpose) param.output.initTexture(device: device, inTranspose: param.input.transpose)
param.filter.initBuffer(device: device)
} }
func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam<P>) throws { func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam<P>) throws {
......
...@@ -55,7 +55,7 @@ class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable { ...@@ -55,7 +55,7 @@ class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable {
} }
emp.yoff = 4 - Int32(param.inputY.tensorDim.cout()) emp.yoff = 4 - Int32(param.inputY.tensorDim.cout())
if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) { if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
print("===> elementwise_add fast!!!") // print("===> elementwise_add fast!!!")
emp.fast = 1 emp.fast = 1
} }
......
...@@ -16,198 +16,198 @@ ...@@ -16,198 +16,198 @@
using namespace metal; using namespace metal;
struct MetalConvParam { struct MetalConvParam {
short offsetX; short offsetX;
short offsetY; short offsetY;
short offsetZ; short offsetZ;
ushort strideX; ushort strideX;
ushort strideY; ushort strideY;
ushort dilationX;
ushort dilationY;
}; };
kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]], kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]], texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]], const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]], const device half4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]], const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]], const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) { gid.z >= outTexture.get_array_size()) {
return; return;
} }
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 stride = ushort2(param.strideX, param.strideY); ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); const uint kernelHXW = 1;
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint input_arr_size = inTexture.get_array_size(); uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
half4 output = half4(0.0);
half4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
half4 output = half4(0.0); half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
half4 input; half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
for (uint i = 0; i < input_arr_size; ++i) { output.z += dot(input, weight_z);
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
outTexture.write(output, gid.xy, gid.z); output.w += dot(input, weight_w);
}
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
outTexture.write(output, gid.xy, gid.z);
} }
kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]], kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]], texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]], const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]], const device half4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]], const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]], const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) { gid.z >= outTexture.get_array_size()) {
return; return;
} }
ushort2 stride = ushort2(param.strideX, param.strideY); ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9; const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size(); uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
half4 output = half4(0.0); half4 output = half4(0.0);
half4 input[9]; half4 input[9];
for (uint i = 0; i < input_arr_size; ++i) { for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) { for (int j = 0; j < 9; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x); output.x += dot(input[j], weight_x);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y); output.y += dot(input[j], weight_y);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z); output.z += dot(input[j], weight_z);
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w); output.w += dot(input[j], weight_w);
}
} }
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); }
outTexture.write(output, gid.xy, gid.z); output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
outTexture.write(output, gid.xy, gid.z);
} }
kernel void conv_add_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]], kernel void conv_add_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]], texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]], const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]], const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) { gid.z >= outTexture.get_array_size()) {
return; return;
} }
ushort2 stride = ushort2(param.strideX, param.strideY); ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1; const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
half4 output = half4(0.0);
half4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
uint input_arr_size = inTexture.get_array_size(); half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; output.y += dot(input, weight_y);
half4 output = half4(0.0); half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
half4 input; half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
for (uint i = 0; i < input_arr_size; ++i) { output.w += dot(input, weight_w);
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); }
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; output = output + biase[gid.z];
output.x += dot(input, weight_x); outTexture.write(output, gid.xy, gid.z);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
output = output + biase[gid.z];
outTexture.write(output, gid.xy, gid.z);
} }
kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]], kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]], texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
const device half *weights [[buffer(1)]], const device half *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]], const device half4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]], const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]], const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) { gid.z >= outTexture.get_array_size()) {
return; return;
} }
uint output_slice = gid.z; uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY); ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9; const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4; uint weithTo = gid.z * kernelHXW * 4;
half4 output = half4(0.0); half4 output = half4(0.0);
half4 inputs[9]; half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) { for (int j = 0; j < 9; ++j) {
half4 input = inputs[j]; half4 input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
} }
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
outTexture.write(output, gid.xy, gid.z); outTexture.write(output, gid.xy, gid.z);
} }
...@@ -223,41 +223,41 @@ kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> ...@@ -223,41 +223,41 @@ kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample>
const device float4 *new_scale [[buffer(3)]], const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]], const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
if (gid.x >= outTexture.get_width() || float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
gid.y >= outTexture.get_height() || output.y += dot(input, weight_y);
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0); float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
float4 input; float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
for (uint i = 0; i < input_arr_size; ++i) { output.w += dot(input, weight_w);
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); }
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
output.x += dot(input, weight_x); outTexture.write(output, gid.xy, gid.z);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z);
} }
kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
...@@ -268,92 +268,92 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> ...@@ -268,92 +268,92 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample>
const device float4 *new_scale [[buffer(3)]], const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]], const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) { gid.z >= outTexture.get_array_size()) {
return; return;
} }
ushort2 stride = ushort2(param.strideX, param.strideY); ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9; const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size(); uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4; uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0); float4 output = float4(0.0);
float4 input[9]; float4 input[9];
for (uint i = 0; i < input_arr_size; ++i) { for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) { for (int j = 0; j < 9; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x); output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y); output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z); output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w); output.w += dot(input[j], weight_w);
}
} }
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); }
outTexture.write(output, gid.xy, gid.z); output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z);
} }
kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]], texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
const device float *weights [[buffer(1)]], const device float *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]], const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]], const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]], const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) { gid.z >= outTexture.get_array_size()) {
return; return;
} }
uint output_slice = gid.z; uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY); ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9; const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4; uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0); float4 output = float4(0.0);
float4 inputs[9]; float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) { for (int j = 0; j < 9; ++j) {
float4 input = inputs[j]; float4 input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
} }
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z); outTexture.write(output, gid.xy, gid.z);
} }
// conv // conv
...@@ -447,10 +447,10 @@ kernel void depthwise_conv_3x3(texture2d_array<float, access::sample> inTexture ...@@ -447,10 +447,10 @@ kernel void depthwise_conv_3x3(texture2d_array<float, access::sample> inTexture
} }
kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]], texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]], const device float4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
...@@ -532,13 +532,13 @@ kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[text ...@@ -532,13 +532,13 @@ kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[text
} }
kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]], texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]], const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]], const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]], const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]], const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
...@@ -556,17 +556,20 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text ...@@ -556,17 +556,20 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text
float4 output = float4(0.0); float4 output = float4(0.0);
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
float4 input[9]; float4 input[9];
for (uint i = 0; i < input_arr_size; ++i) { for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i);
for (int j = 0; j < 9; ++j) { for (int j = 0; j < 9; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x); output.x += dot(input[j], weight_x);
...@@ -586,13 +589,13 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text ...@@ -586,13 +589,13 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text
} }
kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]], texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
const device float *weights [[buffer(1)]], const device float *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]], const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]], const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]], const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
...@@ -629,12 +632,12 @@ kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inText ...@@ -629,12 +632,12 @@ kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inText
#pragma mark - conv bn relu #pragma mark - conv bn relu
kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]], texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]], const device float4 *weights [[buffer(1)]],
const device float4 *new_scale [[buffer(2)]], const device float4 *new_scale [[buffer(2)]],
const device float4 *new_biase [[buffer(3)]], const device float4 *new_biase [[buffer(3)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
...@@ -673,12 +676,12 @@ kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTe ...@@ -673,12 +676,12 @@ kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTe
} }
kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]], texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]], const device float4 *weights [[buffer(1)]],
const device float4 *new_scale [[buffer(2)]], const device float4 *new_scale [[buffer(2)]],
const device float4 *new_biase [[buffer(3)]], const device float4 *new_biase [[buffer(3)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
...@@ -726,12 +729,12 @@ kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTe ...@@ -726,12 +729,12 @@ kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTe
} }
kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]], texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
const device float *weights [[buffer(1)]], const device float *weights [[buffer(1)]],
const device float4 *new_scale [[buffer(2)]], const device float4 *new_scale [[buffer(2)]],
const device float4 *new_biase [[buffer(3)]], const device float4 *new_biase [[buffer(3)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
......
...@@ -48,8 +48,10 @@ extension BlockDesc: CustomStringConvertible, CustomDebugStringConvertible { ...@@ -48,8 +48,10 @@ extension BlockDesc: CustomStringConvertible, CustomDebugStringConvertible {
var description: String { var description: String {
var str = "" var str = ""
for op in ops { for i in 0..<ops.count {
str += op.description str += " op \(i): "
let op = ops[i]
str += op.description
} }
for varDesc in vars { for varDesc in vars {
......
...@@ -16,245 +16,246 @@ import Accelerate ...@@ -16,245 +16,246 @@ import Accelerate
import Foundation import Foundation
protocol Tensorial: CustomStringConvertible, CustomDebugStringConvertible{ protocol Tensorial: CustomStringConvertible, CustomDebugStringConvertible{
var dim: Dim { get set } var dim: Dim { get set }
func numel() -> Int func numel() -> Int
var layout: DataLayout { get } var layout: DataLayout { get }
} }
extension Tensorial { extension Tensorial {
func numel() -> Int { func numel() -> Int {
return dim.numel() return dim.numel()
} }
} }
class Tensor<P: PrecisionType>: Tensorial { class Tensor<P: PrecisionType>: Tensorial {
enum BufferPrecision { enum BufferPrecision {
case Float32, Float16 case Float32, Float16
}
var data: Data
var dim: Dim
var buffer: MTLBuffer!
private(set) var layout: DataLayout
class Data {
init(inSize: Int, inPointer: UnsafeMutablePointer<P>) {
size = inSize
pointer = inPointer
} }
let size: Int
var data: Data var pointer: UnsafeMutablePointer<P>
var dim: Dim subscript(index: Int) -> P{
var buffer: MTLBuffer! get {
private(set) var layout: DataLayout return pointer[index]
}
class Data { set {
init(inSize: Int, inPointer: UnsafeMutablePointer<P>) { pointer[index] = newValue
size = inSize }
pointer = inPointer
}
let size: Int
var pointer: UnsafeMutablePointer<P>
subscript(index: Int) -> P{
get {
return pointer[index]
}
set {
pointer[index] = newValue
}
}
func release() {
pointer.deinitialize(count: size)
pointer.deallocate()
}
deinit {
// release()
}
} }
func release() {
required init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) { pointer.deinitialize(count: size)
dim = inDim pointer.deallocate()
let size = inDim.numel() * MemoryLayout<P>.size }
let pointer = UnsafeMutablePointer<P>.allocate(capacity: size) deinit {
data = Data.init(inSize: size, inPointer: pointer) // release()
layout = inLayout }
}
required init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) {
dim = inDim
let size = inDim.numel() * MemoryLayout<P>.size
let pointer = UnsafeMutablePointer<P>.allocate(capacity: size)
data = Data.init(inSize: size, inPointer: pointer)
layout = inLayout
}
func convert(to: DataLayout) {
guard to != layout else {
return
} }
func convert(to: DataLayout) { guard dim.cout() == 4 else {
guard to != layout else { return
return
}
guard dim.cout() == 4 else {
return
}
guard layout == DataLayout.NCHW() && to == DataLayout.NHWC() else {
// other not support
return
}
let newPointer = UnsafeMutablePointer<P>.allocate(capacity: data.size)
if layout == DataLayout.NCHW() {
NCHW2NHWC(newPtr: newPointer)
}
data.release()
data.pointer = newPointer
layout = to
} }
func float32ToFloat16(input: UnsafeMutablePointer<Float32>, output: UnsafeMutableRawPointer, count: Int) { guard layout == DataLayout.NCHW() && to == DataLayout.NHWC() else {
var float32Buffer = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4) // other not support
var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2) return
guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else {
fatalError(" float 32 to float 16 error ! ")
}
} }
let newPointer = UnsafeMutablePointer<P>.allocate(capacity: data.size)
func initBuffer(device: MTLDevice, precision: BufferPrecision = .Float32) { if layout == DataLayout.NCHW() {
guard let floatPointer = data.pointer as? UnsafeMutablePointer<Float32> else { NCHW2NHWC(newPtr: newPointer)
fatalError(" not support yet ")
}
let precisionSize: Int
switch precision {
case .Float32:
precisionSize = 4
case .Float16:
precisionSize = 2
}
if dim.cout() == 4 {
if layout == DataLayout.NHWC() {
let C = dim[3]
let cSlices = (C + 3) / 4
let paddedC = cSlices * 4
let count = paddedC * dim[0] * dim[1] * dim[2]
if C == paddedC {
buffer = device.makeBuffer(length: count * precisionSize)
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
}
} else if C == 1 {
buffer = device.makeBuffer(length: numel() * precisionSize)
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
}
} else {
buffer = device.makeBuffer(length: count * precisionSize)
let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
var tmpPointer = floatPointer
var dstPtr = convertedPointer
for _ in 0..<dim[0] * dim[1] * dim[2] {
for j in 0..<paddedC {
if j < C {
dstPtr[j] = tmpPointer[j]
}
}
tmpPointer += C
dstPtr += paddedC
}
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
}
convertedPointer.deinitialize(count: count)
convertedPointer.deallocate()
}
}
} else if dim.cout() == 1 {
buffer = device.makeBuffer(length: numel() * precisionSize)
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
}
} else {
fatalError(" not support !")
}
//TODO: release
data.release()
} }
var width: Int { data.release()
get { data.pointer = newPointer
if dim.cout() == 4 { layout = to
return dim[1] }
} else {
fatalError() func float32ToFloat16(input: UnsafeMutablePointer<Float32>, output: UnsafeMutableRawPointer, count: Int) {
} var float32Buffer = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4)
} var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2)
guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else {
fatalError(" float 32 to float 16 error ! ")
}
}
func initBuffer(device: MTLDevice, precision: BufferPrecision = .Float32) {
guard let floatPointer = data.pointer as? UnsafeMutablePointer<Float32> else {
fatalError(" not support yet ")
} }
var height: Int { let precisionSize: Int
get { switch precision {
if dim.cout() == 4 { case .Float32:
return dim[2] precisionSize = 4
} else { case .Float16:
fatalError() precisionSize = 2
}
}
} }
var channel: Int { if dim.cout() == 4 {
get { if layout == DataLayout.NHWC() {
if dim.cout() == 4 { let C = dim[3]
return dim[3] let cSlices = (C + 3) / 4
} else { let paddedC = cSlices * 4
fatalError() let count = paddedC * dim[0] * dim[1] * dim[2]
if C == paddedC {
buffer = device.makeBuffer(length: count * precisionSize)
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
}
} else if C == 1 {
buffer = device.makeBuffer(length: numel() * precisionSize)
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
}
} else {
buffer = device.makeBuffer(length: count * precisionSize)
let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
var tmpPointer = floatPointer
var dstPtr = convertedPointer
for _ in 0..<dim[0] * dim[1] * dim[2] {
for j in 0..<paddedC {
if j < C {
dstPtr[j] = tmpPointer[j]
}
} }
tmpPointer += C
dstPtr += paddedC
}
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
}
convertedPointer.deinitialize(count: count)
convertedPointer.deallocate()
} }
}
} else if dim.cout() == 1 {
let num = ((numel() + 3) / 4) * 4
buffer = device.makeBuffer(length: num * precisionSize)
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: data.pointer, byteCount: num * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: floatPointer, output: buffer.contents(), count: num)
}
} else {
fatalError(" not support !")
} }
//TODO: release
data.release()
}
var width: Int {
get {
if dim.cout() == 4 {
return dim[1]
} else {
fatalError()
}
}
}
var height: Int {
get {
if dim.cout() == 4 {
return dim[2]
} else {
fatalError()
}
}
}
var channel: Int {
get {
if dim.cout() == 4 {
return dim[3]
} else {
fatalError()
}
}
}
func NCHW2NHWC(newPtr: UnsafeMutablePointer<P>) {
let N = dim[0]
let C = dim[1]
let H = dim[2]
let W = dim[3]
let HXW = H * W
let CXHXW = C * H * W
func NCHW2NHWC(newPtr: UnsafeMutablePointer<P>) { var index: Int = 0
let N = dim[0] for n in 0..<N {
let C = dim[1] for h in 0..<H{
let H = dim[2] for w in 0..<W{
let W = dim[3] for c in 0..<C{
let HXW = H * W newPtr[index] = data.pointer[n * CXHXW + c * HXW + h * W + w]
let CXHXW = C * H * W index += 1
}
var index: Int = 0
for n in 0..<N {
for h in 0..<H{
for w in 0..<W{
for c in 0..<C{
newPtr[index] = data.pointer[n * CXHXW + c * HXW + h * W + w]
index += 1
}
}
}
} }
dim.swapeDimAt(index1: 1, index2: 3) }
} }
dim.swapeDimAt(index1: 1, index2: 3)
}
} }
extension Tensor { extension Tensor {
var debugDescription: String { var debugDescription: String {
var str = "dim: \(dim) \n" var str = "dim: \(dim) \n"
str += "MTLBuffer: \(self.buffer) \n" str += "MTLBuffer: \(self.buffer) \n"
for i in 0..<buffer.length/MemoryLayout<P>.size { for i in 0..<buffer.length/MemoryLayout<P>.size {
str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])" str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])"
}
return str
} }
return str
func logDataPointer(header: String = "") { }
print(header)
var str = "" func logDataPointer(header: String = "") {
str += "data size: \(data.size) \n" print(header)
str += "dim: \(dim) \n" var str = ""
for i in 0..<numel() { str += "data size: \(data.size) \n"
str += " \(data.pointer[i])" str += "dim: \(dim) \n"
} for i in 0..<numel() {
print(str) str += " \(data.pointer[i])"
} }
print(str)
var description: String { }
return debugDescription
} var description: String {
return debugDescription
}
} }
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册