提交 afd4b73c 编写于 作者: L liuruilong

run mobilenet ssd and genet

上级 c5a91cbb
...@@ -68,6 +68,7 @@ class MobileNet_ssd_hand: Net{ ...@@ -68,6 +68,7 @@ class MobileNet_ssd_hand: Net{
let output: [Float32] = result.map { $0.floatValue } let output: [Float32] = result.map { $0.floatValue }
return output return output
} }
......
...@@ -19,17 +19,17 @@ import MetalPerformanceShaders ...@@ -19,17 +19,17 @@ import MetalPerformanceShaders
let threadSupport = [1] let threadSupport = [1]
let modelHelperMap: [SupportModel : Net] = [.mobilenet_ssd : MobileNet_ssd_hand.init(), .genet : Genet.init()] let modelHelperMap: [SupportModel : Net] = [ .mobilenet : MobileNet.init(), .mobilenet_ssd : MobileNet_ssd_hand.init(), .genet : Genet.init()]
//, .genet : Genet.init() //, .genet : Genet.init()
//let modelHelperMap: [SupportModel : Net] = [.mobilenet : MobileNet.init(), .mobilenet_ssd : MobileNet_ssd_hand.init()] //let modelHelperMap: [SupportModel : Net] = [.mobilenet : MobileNet.init(), .mobilenet_ssd : MobileNet_ssd_hand.init()]
enum SupportModel: String{ enum SupportModel: String{
// case mobilenet = "mobilenet" case mobilenet = "mobilenet"
case mobilenet_ssd = "mobilenetssd" case mobilenet_ssd = "mobilenetssd"
case genet = "genet" case genet = "genet"
static func supportedModels() -> [SupportModel] { static func supportedModels() -> [SupportModel] {
//.mobilenet, //
return [.mobilenet_ssd ,.genet] return [.mobilenet, .mobilenet_ssd ,.genet]
} }
} }
......
...@@ -41,7 +41,6 @@ ...@@ -41,7 +41,6 @@
FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */; }; FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */; };
FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */; }; FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */; };
FC1B16B320EC9A4F00678B91 /* Kernels.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC1B16B220EC9A4F00678B91 /* Kernels.metal */; }; FC1B16B320EC9A4F00678B91 /* Kernels.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC1B16B220EC9A4F00678B91 /* Kernels.metal */; };
FC1B186620ECF1C600678B91 /* ResizeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC1B186520ECF1C600678B91 /* ResizeKernel.swift */; };
FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */; }; FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */; };
FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74820F0B954007C0C6D /* ConvKernel.metal */; }; FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74820F0B954007C0C6D /* ConvKernel.metal */; };
FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */; }; FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */; };
...@@ -133,7 +132,6 @@ ...@@ -133,7 +132,6 @@
FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BatchNormKernel.swift; sourceTree = "<group>"; }; FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BatchNormKernel.swift; sourceTree = "<group>"; };
FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddKernel.swift; sourceTree = "<group>"; }; FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddKernel.swift; sourceTree = "<group>"; };
FC1B16B220EC9A4F00678B91 /* Kernels.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Kernels.metal; sourceTree = "<group>"; }; FC1B16B220EC9A4F00678B91 /* Kernels.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Kernels.metal; sourceTree = "<group>"; };
FC1B186520ECF1C600678B91 /* ResizeKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ResizeKernel.swift; sourceTree = "<group>"; };
FC27990D21341016000B6BAD /* BoxCoder.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BoxCoder.metal; sourceTree = "<group>"; }; FC27990D21341016000B6BAD /* BoxCoder.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BoxCoder.metal; sourceTree = "<group>"; };
FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PaddleMobileUnitTest.swift; sourceTree = "<group>"; }; FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PaddleMobileUnitTest.swift; sourceTree = "<group>"; };
FC4CB74820F0B954007C0C6D /* ConvKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvKernel.metal; sourceTree = "<group>"; }; FC4CB74820F0B954007C0C6D /* ConvKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvKernel.metal; sourceTree = "<group>"; };
...@@ -326,7 +324,6 @@ ...@@ -326,7 +324,6 @@
FCEB6837212F00B100D2448E /* metal */, FCEB6837212F00B100D2448E /* metal */,
FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */, FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */,
FC0E2DBB20EE45FE009C1FAC /* ConvKernel.swift */, FC0E2DBB20EE45FE009C1FAC /* ConvKernel.swift */,
FC1B186520ECF1C600678B91 /* ResizeKernel.swift */,
FC0E2DB920EE3B8D009C1FAC /* ReluKernel.swift */, FC0E2DB920EE3B8D009C1FAC /* ReluKernel.swift */,
FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */, FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */,
FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */, FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */,
...@@ -506,7 +503,6 @@ ...@@ -506,7 +503,6 @@
FC039BBB20E11CC20081E9F8 /* ProgramDesc.swift in Sources */, FC039BBB20E11CC20081E9F8 /* ProgramDesc.swift in Sources */,
FC9D037920E229E4000F735A /* OpParam.swift in Sources */, FC9D037920E229E4000F735A /* OpParam.swift in Sources */,
FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */, FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */,
FC1B186620ECF1C600678B91 /* ResizeKernel.swift in Sources */,
FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */, FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */,
FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */, FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */,
FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */, FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */,
......
...@@ -354,7 +354,7 @@ public extension MTLTexture { ...@@ -354,7 +354,7 @@ public extension MTLTexture {
} }
// n c h w - dim // n c h w - dim
func toTensor(dim: (n: Int, c: Int, h: Int, w: Int), texturePrecision: ComputePrecision = .Float16) -> [Float32] { func toTensor(dim: (n: Int, c: Int, h: Int, w: Int)) -> [Float32] {
// print("origin dim: \(dim)") // print("origin dim: \(dim)")
print("texture: ") print("texture: ")
print(self) print(self)
...@@ -392,7 +392,7 @@ public extension MTLTexture { ...@@ -392,7 +392,7 @@ public extension MTLTexture {
return output return output
} }
func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int), texturePrecision: ComputePrecision = .Float16) -> [Float32] { func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int)) -> [Float32] {
// print("origin dim: \(dim)") // print("origin dim: \(dim)")
// print("texture: ") // print("texture: ")
// print(self) // print(self)
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
import Foundation import Foundation
let testTo = 61 let testTo = 161
var isTest = false var isTest = false
...@@ -128,18 +128,18 @@ public class Executor<P: PrecisionType> { ...@@ -128,18 +128,18 @@ public class Executor<P: PrecisionType> {
// print(stridableInput) // print(stridableInput)
// let _: Flo? = input.logDesc(header: "input: ", stridable: true) // let _: Flo? = input.logDesc(header: "input: ", stridable: true)
for i in 0..<self.ops.count { // for i in 0..<self.ops.count {
let op = self.ops[i] // let op = self.ops[i]
print(" 第 \(i) 个 op: ") // print(" 第 \(i) 个 op: ")
op.delogOutput() // op.delogOutput()
} // }
// self.ops[59].delogOutput() // self.ops[testTo - 2].delogOutput()
// self.ops[testTo - 1].delogOutput()
// self.ops[60].delogOutput() // self.ops[60].delogOutput()
return // return
let afterDate = Date.init() let afterDate = Date.init()
var resultHolder: ResultHolder<P> var resultHolder: ResultHolder<P>
if except > 0 { if except > 0 {
resultHolder = ResultHolder<P>.init(inDim: [], inResult: [], inElapsedTime: afterDate.timeIntervalSince(beforeDate), inIntermediateResults: outputTextures) resultHolder = ResultHolder<P>.init(inDim: [], inResult: [], inElapsedTime: afterDate.timeIntervalSince(beforeDate), inIntermediateResults: outputTextures)
......
...@@ -159,7 +159,7 @@ public class Loader<P: PrecisionType> { ...@@ -159,7 +159,7 @@ public class Loader<P: PrecisionType> {
} catch let error { } catch let error {
throw error throw error
} }
tensor.convert(to: DataLayout.NHWC()) // tensor.convert(to: DataLayout.NHWC())
// tensor.initBuffer(device: device) // tensor.initBuffer(device: device)
scope[varDesc.name] = tensor scope[varDesc.name] = tensor
} else { } else {
......
...@@ -75,12 +75,12 @@ class BoxcoderOp<P: PrecisionType>: Operator<BoxcoderKernel<P>, BoxcoderParam<P> ...@@ -75,12 +75,12 @@ class BoxcoderOp<P: PrecisionType>: Operator<BoxcoderKernel<P>, BoxcoderParam<P>
// print(targetBoxArray.strideArray()) // print(targetBoxArray.strideArray())
let targetBoxOriginDim = para.targetBox.originDim let targetBoxOriginDim = para.targetBox.originDim
let targetBoxArray = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxOriginDim[0], h: targetBoxOriginDim[1], w: targetBoxOriginDim[2], c: targetBoxOriginDim[3]), texturePrecision: computePrecision) let targetBoxArray = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxOriginDim[0], h: targetBoxOriginDim[1], w: targetBoxOriginDim[2], c: targetBoxOriginDim[3]))
print(" target box ") print(" target box ")
print(targetBoxArray.strideArray()) print(targetBoxArray.strideArray())
let originDim = para.output.originDim let originDim = para.output.originDim
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
print(" output ") print(" output ")
print(outputArray.strideArray()) print(outputArray.strideArray())
} }
......
...@@ -67,10 +67,10 @@ class ConcatOp<P: PrecisionType>: Operator<ConcatKernel<P>, ConcatParam<P>>, Run ...@@ -67,10 +67,10 @@ class ConcatOp<P: PrecisionType>: Operator<ConcatKernel<P>, ConcatParam<P>>, Run
print(" \(type) output: ") print(" \(type) output: ")
let originDim = para.output.originDim let originDim = para.output.originDim
if para.output.transpose == [0, 1, 2, 3] { if para.output.transpose == [0, 1, 2, 3] {
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
print(outputArray.strideArray()) print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] { } else if para.output.transpose == [0, 2, 3, 1] {
print(para.output.metalTexture.toTensor(dim: (n: originDim[0], c: originDim[1], h: originDim[2], w: originDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: originDim[0], c: originDim[1], h: originDim[2], w: originDim[3])).strideArray())
} else { } else {
fatalError(" not implemet") fatalError(" not implemet")
} }
......
...@@ -94,13 +94,15 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>, ...@@ -94,13 +94,15 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>,
func delogOutput() { func delogOutput() {
print("op \(type): ")
print(" \(type) output: ")
print(" padding: ") print(" padding: ")
print(para.paddings) print(para.paddings)
print("stride: ") print("stride: ")
print(para.stride) print(para.stride)
print("dilations: ") print("dilations: ")
print(para.dilations) print(para.dilations)
print(" \(type) output: ")
print(" para input dim: ") print(" para input dim: ")
print(para.input.dim) print(para.input.dim)
print(" para filter dim: ") print(" para filter dim: ")
...@@ -111,6 +113,14 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>, ...@@ -111,6 +113,14 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>,
let biase: [Float32] = para.y.buffer.array() let biase: [Float32] = para.y.buffer.array()
print(biase) print(biase)
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) print(" - filter - ")
let array: [Float32] = para.filter.buffer.array()
print(array)
print(" - y - ")
let yArray: [Float32] = para.y.buffer.array()
print(yArray)
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
} }
} }
...@@ -110,7 +110,7 @@ class ConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNReluPa ...@@ -110,7 +110,7 @@ class ConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNReluPa
func delogOutput() { func delogOutput() {
print(" \(type) output: ") print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray())
} }
} }
...@@ -43,13 +43,15 @@ class ConvTransposeOp<P: PrecisionType>: Operator<ConvTransposeKernel<P>, ConvTr ...@@ -43,13 +43,15 @@ class ConvTransposeOp<P: PrecisionType>: Operator<ConvTransposeKernel<P>, ConvTr
} }
func delogOutput() { func delogOutput() {
print(" \(type) output: ") print(" \(type) output: ")
let originDim = para.output.originDim let originDim = para.output.originDim
if para.output.transpose == [0, 1, 2, 3] { if para.output.transpose == [0, 1, 2, 3] {
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
print(outputArray.strideArray()) print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] { } else if para.output.transpose == [0, 2, 3, 1] {
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) let output = para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]))
print(output.strideArray())
} else { } else {
print(" not implement") print(" not implement")
} }
......
...@@ -58,6 +58,6 @@ class DepthConvOp<P: PrecisionType>: Operator<ConvKernel<P>, ConvParam<P>>, Runa ...@@ -58,6 +58,6 @@ class DepthConvOp<P: PrecisionType>: Operator<ConvKernel<P>, ConvParam<P>>, Runa
func delogOutput() { func delogOutput() {
print(" \(type) output: ") print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray())
} }
} }
...@@ -65,6 +65,6 @@ class DwConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNRelu ...@@ -65,6 +65,6 @@ class DwConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNRelu
func delogOutput() { func delogOutput() {
print(" \(type) output: ") print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray())
} }
} }
...@@ -71,12 +71,15 @@ class ElementwiseAddOp<P: PrecisionType>: Operator<ElementwiseAddKernel<P>, Elem ...@@ -71,12 +71,15 @@ class ElementwiseAddOp<P: PrecisionType>: Operator<ElementwiseAddKernel<P>, Elem
// print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray()) // print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray())
print(" \(type) output: ") print(" \(type) output: ")
print(para.inputY)
let originDim = para.output.originDim let originDim = para.output.originDim
if para.output.transpose == [0, 1, 2, 3] { if para.output.transpose == [0, 1, 2, 3] {
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
print(outputArray.strideArray()) print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] { } else if para.output.transpose == [0, 2, 3, 1] {
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
} else { } else {
print(" not implement") print(" not implement")
} }
......
...@@ -61,7 +61,7 @@ class FeedOp<P: PrecisionType>: Operator<Texture2DTo2DArrayKernel<P>, FeedParam< ...@@ -61,7 +61,7 @@ class FeedOp<P: PrecisionType>: Operator<Texture2DTo2DArrayKernel<P>, FeedParam<
func delogOutput() { func delogOutput() {
print(" \(type) output: ") print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray())
} }
} }
...@@ -15,53 +15,60 @@ ...@@ -15,53 +15,60 @@
import Foundation import Foundation
class BatchNormKernel<P: PrecisionType>: Kernel, Computable { class BatchNormKernel<P: PrecisionType>: Kernel, Computable {
var newScale: MTLBuffer var newScale: MTLBuffer
var newBias: MTLBuffer var newBias: MTLBuffer
required init(device: MTLDevice, param: BatchNormParam<P>) {
guard let newScale = device.makeBuffer(length: param.inputScale.buffer.length) else {
fatalError()
}
guard let newBias = device.makeBuffer(length: param.inputBias.buffer.length) else {
fatalError()
}
self.newScale = newScale
self.newBias = newBias
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "batchnorm")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "batchnorm_half")
} else {
fatalError()
}
let varianceBuffer : MTLBuffer = param.inputVariance.buffer
required init(device: MTLDevice, param: BatchNormParam<P>) { var invStd: [Float32] = Array(repeating: 0, count: varianceBuffer.length)
guard let newScale = device.makeBuffer(length: param.inputScale.buffer.length) else { let varianceContents = varianceBuffer.contents().assumingMemoryBound(to: P.self)
fatalError() for i in 0..<(varianceBuffer.length / MemoryLayout<P>.stride) {
} invStd[i] = 1 / (Float32(varianceContents[i]) + param.epsilon).squareRoot()
guard let newBias = device.makeBuffer(length: param.inputBias.buffer.length) else {
fatalError()
}
self.newScale = newScale
self.newBias = newBias
super.init(device: device, inFunctionName: "batchnorm")
let varianceBuffer : MTLBuffer = param.inputVariance.buffer
var invStd: [Float32] = Array(repeating: 0, count: varianceBuffer.length)
let varianceContents = varianceBuffer.contents().assumingMemoryBound(to: P.self)
for i in 0..<(varianceBuffer.length / MemoryLayout<P>.stride) {
invStd[i] = 1 / (Float32(varianceContents[i]) + param.epsilon).squareRoot()
}
let newScaleContents = newScale.contents().assumingMemoryBound(to: P.self)
let newBiasContents = newBias.contents().assumingMemoryBound(to: P.self)
let scale : MTLBuffer = param.inputScale.buffer
let scaleContents = scale.contents().assumingMemoryBound(to: P.self)
let bias : MTLBuffer = param.inputBias.buffer
let biasContents = bias.contents().assumingMemoryBound(to: P.self)
let meanContents = param.inputMean.buffer.contents().assumingMemoryBound(to: P.self)
for i in 0..<(newScale.length / MemoryLayout<P>.stride) {
newScaleContents[i] = P(invStd[i] * Float32(scaleContents[i]))
newBiasContents[i] = P(Float32(biasContents[i]) - Float32(meanContents[i]) * invStd[i] * Float32(scaleContents[i]))
}
} }
func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam<P>) throws { let newScaleContents = newScale.contents().assumingMemoryBound(to: P.self)
guard let encoder = commandBuffer.makeComputeCommandEncoder() else { let newBiasContents = newBias.contents().assumingMemoryBound(to: P.self)
throw PaddleMobileError.predictError(message: " encoder is nil") let scale : MTLBuffer = param.inputScale.buffer
} let scaleContents = scale.contents().assumingMemoryBound(to: P.self)
print("BatchNorm compute") let bias : MTLBuffer = param.inputBias.buffer
encoder.setTexture(param.input.metalTexture, index: 0) let biasContents = bias.contents().assumingMemoryBound(to: P.self)
encoder.setTexture(param.output.metalTexture, index: 1) let meanContents = param.inputMean.buffer.contents().assumingMemoryBound(to: P.self)
encoder.setBuffer(newScale, offset: 0, index: 0)
encoder.setBuffer(newBias, offset: 0, index: 1) for i in 0..<(newScale.length / MemoryLayout<P>.stride) {
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) newScaleContents[i] = P(invStd[i] * Float32(scaleContents[i]))
encoder.endEncoding() newBiasContents[i] = P(Float32(biasContents[i]) - Float32(meanContents[i]) * invStd[i] * Float32(scaleContents[i]))
}
}
func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
} }
print("BatchNorm compute")
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBuffer(newScale, offset: 0, index: 0)
encoder.setBuffer(newBias, offset: 0, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
} }
...@@ -49,26 +49,39 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable ...@@ -49,26 +49,39 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
var metalParam: MetalConvParam! var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvAddBatchNormReluParam<P>) { required init(device: MTLDevice, param: ConvAddBatchNormReluParam<P>) {
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
} else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
} else {
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
}
param.filter.initBuffer(device: device, precision: computePrecision) param.filter.initBuffer(device: device, precision: computePrecision)
param.y.initBuffer(device: device, precision: computePrecision) param.y.initBuffer(device: device, precision: computePrecision)
param.variance.initBuffer(device: device, precision: .Float32) param.variance.initBuffer(device: device, precision: .Float32)
param.mean.initBuffer(device: device, precision: .Float32) param.mean.initBuffer(device: device, precision: .Float32)
param.scale.initBuffer(device: device, precision: .Float32) param.scale.initBuffer(device: device, precision: .Float32)
param.bias.initBuffer(device: device, precision: .Float32) param.bias.initBuffer(device: device, precision: .Float32)
if computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
} else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
} else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
} else {
fatalError(" unsupport ")
}
} else if computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1_half")
} else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3_half")
} else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3_half")
} else {
fatalError(" unsupport ")
}
} else {
fatalError()
}
let offsetX = param.filter.width/2 - Int(param.paddings[0]) let offsetX = param.filter.width/2 - Int(param.paddings[0])
let offsetY = param.filter.height/2 - Int(param.paddings[1]) let offsetY = param.filter.height/2 - Int(param.paddings[1])
......
...@@ -17,14 +17,23 @@ import Foundation ...@@ -17,14 +17,23 @@ import Foundation
class ConvAddKernel<P: PrecisionType>: Kernel, Computable { class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
var metalParam: MetalConvParam! var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvAddParam<P>) { required init(device: MTLDevice, param: ConvAddParam<P>) {
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
param.filter.initBuffer(device: device, precision: computePrecision)
param.y.initBuffer(device: device, precision: computePrecision)
if computePrecision == .Float16 { if computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 { if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_1x1_half") super.init(device: device, inFunctionName: "conv_add_1x1_half")
} else if param.filter.channel == 1 { } else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half") super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half")
} else { } else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_add_3x3_half") super.init(device: device, inFunctionName: "conv_add_3x3_half")
} else if param.filter.width == 1 && param.filter.height == 5 {
super.init(device: device, inFunctionName: "conv_add_5x1_half")
} else if param.filter.width == 5 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_1x5_half")
} else {
fatalError(" unsupport yet ")
} }
} else if computePrecision == .Float32 { } else if computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 { if param.filter.width == 1 && param.filter.height == 1 {
...@@ -35,22 +44,21 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable { ...@@ -35,22 +44,21 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
super.init(device: device, inFunctionName: "conv_add_5x1") super.init(device: device, inFunctionName: "conv_add_5x1")
} else if param.filter.width == 5 && param.filter.height == 1 { } else if param.filter.width == 5 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_1x5") super.init(device: device, inFunctionName: "conv_add_1x5")
} else { } else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_add_3x3") super.init(device: device, inFunctionName: "conv_add_3x3")
} else {
fatalError(" unsupport yet ")
} }
} else { } else {
fatalError() fatalError()
} }
let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
param.filter.initBuffer(device: device, precision: computePrecision)
param.y.initBuffer(device: device, precision: computePrecision)
print(" function: \(functionName)") print(" function: \(functionName)")
print("offset x: \(offsetX)") print("offset x: \(offsetX)")
print("offset y: \(offsetY)") print("offset y: \(offsetY)")
......
...@@ -49,35 +49,41 @@ class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable { ...@@ -49,35 +49,41 @@ class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
} }
var metalParam: MetalConvParam! var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvBNReluParam<P>) { required init(device: MTLDevice, param: ConvBNReluParam<P>) {
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
param.filter.initBuffer(device: device, precision: computePrecision)
param.variance.initBuffer(device: device, precision: .Float32)
param.mean.initBuffer(device: device, precision: .Float32)
param.scale.initBuffer(device: device, precision: .Float32)
param.bias.initBuffer(device: device, precision: .Float32)
if computePrecision == .Float32 { if computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 { if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1") super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1")
} else if param.filter.channel == 1 { } else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3") super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3")
} else { } else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3") super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3")
} else {
fatalError(" unsupport ")
} }
} else if computePrecision == .Float16 { } else if computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 { if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half") super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half")
} else if param.filter.channel == 1 { } else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half") super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half")
} else { } else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half") super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half")
} else {
fatalError(" unsupport ")
} }
} else { } else {
fatalError() fatalError()
} }
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
param.filter.initBuffer(device: device, precision: computePrecision)
param.variance.initBuffer(device: device, precision: .Float32)
param.mean.initBuffer(device: device, precision: .Float32)
param.scale.initBuffer(device: device, precision: .Float32)
param.bias.initBuffer(device: device, precision: .Float32)
let offsetX = param.filter.width/2 - Int(param.paddings[0]) let offsetX = param.filter.width/2 - Int(param.paddings[0])
let offsetY = param.filter.height/2 - Int(param.paddings[1]) let offsetY = param.filter.height/2 - Int(param.paddings[1])
......
...@@ -27,18 +27,20 @@ public struct MetalConvParam { ...@@ -27,18 +27,20 @@ public struct MetalConvParam {
class ConvKernel<P: PrecisionType>: Kernel, Computable { class ConvKernel<P: PrecisionType>: Kernel, Computable {
var metalParam: MetalConvParam! var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvParam<P>) { required init(device: MTLDevice, param: ConvParam<P>) {
param.filter.initBuffer(device: device, precision: ComputePrecision.Float32)
if param.filter.width == 1 && param.filter.height == 1 { if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_1x1") super.init(device: device, inFunctionName: "conv_1x1")
} else if param.filter.channel == 1 { } else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_3x3") super.init(device: device, inFunctionName: "depthwise_conv_3x3")
} else { } else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_3x3") super.init(device: device, inFunctionName: "conv_3x3")
} else {
fatalError(" unsupport ")
} }
let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0]) let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0])
let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1]) let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1])
let offsetZ = 0.0 let offsetZ = 0.0
param.filter.initBuffer(device: device, precision: ComputePrecision.Float32)
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
} }
......
...@@ -31,7 +31,27 @@ struct MetalConvTransposeParam { ...@@ -31,7 +31,27 @@ struct MetalConvTransposeParam {
class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{ class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{
var metalParam: MetalConvTransposeParam! var metalParam: MetalConvTransposeParam!
required init(device: MTLDevice, param: ConvTransposeParam<P>) { required init(device: MTLDevice, param: ConvTransposeParam<P>) {
super.init(device: device, inFunctionName: "conv_transpose") param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
param.filter.initBuffer(device: device, precision: computePrecision, convertToNHWC: false, withTranspose: true)
// Pick the Metal function that matches the active compute precision.
// Only the stride-[2, 2] variant is implemented; every other configuration aborts.
// NOTE(review): the previous condition tested `param.stride == [2, 2]` twice. The second
// clause was presumably meant to verify the 2x2 filter size that the kernel name implies
// (e.g. param.filter.width/height) — confirm the filter layout after initBuffer before
// tightening this check. Dropping the duplicate clause does not change behavior.
if computePrecision == .Float32 {
    if param.stride == [2, 2] {
        super.init(device: device, inFunctionName: "conv_transpose2x2_stride2")
    } else {
        fatalError(" -- conv transpose unsupported yet -- ")
    }
} else if computePrecision == .Float16 {
    if param.stride == [2, 2] {
        super.init(device: device, inFunctionName: "conv_transpose2x2_stride2_half")
    } else {
        fatalError(" -- conv transpose unsupported yet -- ")
    }
} else {
    // No kernel exists for any other precision.
    fatalError()
}
// let filter: [Float32] = param.filter.buffer.array()
// print(" conv transpose filter")
// print(filter)
let kernelWidth = UInt16(param.filter.width) let kernelWidth = UInt16(param.filter.width)
let kernelHeight = UInt16(param.filter.height) let kernelHeight = UInt16(param.filter.height)
...@@ -43,9 +63,7 @@ class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{ ...@@ -43,9 +63,7 @@ class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{
let dilationY = UInt16(param.dilations[1]) let dilationY = UInt16(param.dilations[1])
metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY) metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY)
param.output.initTexture(device: device, inTranspose: param.input.transpose)
param.filter.initBuffer(device: device)
} }
func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam<P>) throws { func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam<P>) throws {
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
import Foundation import Foundation
struct ElementwiseAddMetalParam { struct ElementwiseAddMetalParam {
var unsafe_one_dim: Int32 = 0
var fast: Int32 = 0 var fast: Int32 = 0
var axis: Int32 = 0 var axis: Int32 = 0
var yoff: Int32 = 0 var yoff: Int32 = 0
...@@ -26,8 +27,14 @@ struct ElementwiseAddMetalParam { ...@@ -26,8 +27,14 @@ struct ElementwiseAddMetalParam {
class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable { class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable {
required init(device: MTLDevice, param: ElementwiseAddParam<P>) { required init(device: MTLDevice, param: ElementwiseAddParam<P>) {
super.init(device: device, inFunctionName: "elementwise_add")
param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "elementwise_add")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "elementwise_add_half")
} else {
fatalError()
}
} }
func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam<P>) throws { func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam<P>) throws {
...@@ -59,6 +66,11 @@ class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable { ...@@ -59,6 +66,11 @@ class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable {
emp.fast = 1 emp.fast = 1
} }
// TODO:
if param.inputY.tensorDim.cout() == 1 {
emp.unsafe_one_dim = 1;
}
encoder.setBytes(&emp, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0) encoder.setBytes(&emp, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding() encoder.endEncoding()
......
...@@ -27,8 +27,14 @@ struct PoolMetalParam { ...@@ -27,8 +27,14 @@ struct PoolMetalParam {
class PoolKernel<P: PrecisionType>: Kernel, Computable{ class PoolKernel<P: PrecisionType>: Kernel, Computable{
required init(device: MTLDevice, param: PoolParam<P>) { required init(device: MTLDevice, param: PoolParam<P>) {
super.init(device: device, inFunctionName: "pool")
param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "pool")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "pool_half")
} else {
fatalError()
}
} }
func compute(commandBuffer: MTLCommandBuffer, param: PoolParam<P>) throws { func compute(commandBuffer: MTLCommandBuffer, param: PoolParam<P>) throws {
......
...@@ -10,15 +10,27 @@ import Foundation ...@@ -10,15 +10,27 @@ import Foundation
class PreluKernel<P: PrecisionType>: Kernel, Computable{ class PreluKernel<P: PrecisionType>: Kernel, Computable{
required init(device: MTLDevice, param: PreluParam<P>) { required init(device: MTLDevice, param: PreluParam<P>) {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "prelu_channel")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "prelu_element")
} else {
super.init(device: device, inFunctionName: "prelu_other")
}
param.alpha.initBuffer(device: device, precision: computePrecision) param.alpha.initBuffer(device: device, precision: computePrecision)
param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
if computePrecision == .Float32 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "prelu_channel")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "prelu_element")
} else {
super.init(device: device, inFunctionName: "prelu_other")
}
} else if computePrecision == .Float16 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "prelu_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "prelu_element_half")
} else {
super.init(device: device, inFunctionName: "prelu_other_half")
}
} else {
fatalError()
}
} }
func compute(commandBuffer: MTLCommandBuffer, param: PreluParam<P>) throws { func compute(commandBuffer: MTLCommandBuffer, param: PreluParam<P>) throws {
......
...@@ -33,6 +33,10 @@ class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{ ...@@ -33,6 +33,10 @@ class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{
var metalParam: PriorBoxMetalParam! var metalParam: PriorBoxMetalParam!
required init(device: MTLDevice, param: PriorBoxParam<P>) { required init(device: MTLDevice, param: PriorBoxParam<P>) {
param.output.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
if computePrecision == .Float32 { if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "prior_box") super.init(device: device, inFunctionName: "prior_box")
} else if computePrecision == .Float16 { } else if computePrecision == .Float16 {
...@@ -41,9 +45,6 @@ class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{ ...@@ -41,9 +45,6 @@ class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{
fatalError() fatalError()
} }
param.output.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
let n = 1 let n = 1
let h = param.output.dim[1] let h = param.output.dim[1]
let w = param.output.dim[2] let w = param.output.dim[2]
......
...@@ -15,17 +15,23 @@ ...@@ -15,17 +15,23 @@
import Foundation import Foundation
class ReluKernel<P: PrecisionType>: Kernel, Computable{ class ReluKernel<P: PrecisionType>: Kernel, Computable{
func compute(commandBuffer: MTLCommandBuffer, param: ReluParam<P>) throws { func compute(commandBuffer: MTLCommandBuffer, param: ReluParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else { guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil") throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
} }
encoder.setTexture(param.input.metalTexture, index: 0)
required init(device: MTLDevice, param: ReluParam<P>) { encoder.setTexture(param.output.metalTexture, index: 1)
super.init(device: device, inFunctionName: "relu") encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
required init(device: MTLDevice, param: ReluParam<P>) {
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "relu")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "relu_half")
} else {
fatalError()
} }
}
} }
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
//
//import Foundation
//import MetalPerformanceShaders
//
//
//struct ResizeParam: OpParam{
// typealias OutputType = <#type#>
//
// typealias ParamPrecisionType = <#type#>
//
// let input: MTLTexture
// let output: MTLTexture
// let expectDim: Dim
//}
//
//struct OutputDim {
// let width: UInt16
// let height: UInt16
// let strideX: UInt16
// let strideY: UInt16
//}
//
//class ResizeKernel<P: PrecisionType>: Kernel, Computable{
// var lanczos: MPSImageLanczosScale
// required init(device: MTLDevice, param: ResizeParam) {
// lanczos = MPSImageLanczosScale.init(device: device)
// super.init(device: device, inFunctionName: "resize")
// }
// func compute(commandBuffer: MTLCommandBuffer, param: ResizeParam) throws {
//// guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
//// throw PaddleMobileError.predictError(message: " encode is nil")
//// }
// lanczos.encode(commandBuffer: commandBuffer, sourceTexture: param.input, destinationTexture: param.output)
//
//// encoder.setTexture(param.input, index: 0)
//// encoder.setTexture(param.output, index: 1)
//// let strideX = param.input.width/param.expectDim[2]
//// let strideY = param.input.height/param.expectDim[1]
//// var outputDim = OutputDim.init(width: UInt16(param.expectDim[1]), height: UInt16(param.expectDim[2]), strideX: UInt16(strideX), strideY: UInt16(strideY))
//// encoder.setBytes(&outputDim, length: MemoryLayout<OutputDim>.size, index: 0)
//// encoder.dispatch(computePipline: pipline, outTexture: param.output)
//// encoder.endEncoding()
// }
//
//
//
//
//}
...@@ -429,7 +429,122 @@ kernel void depthwise_conv_add_3x3_half(texture2d_array<half, access::sample> in ...@@ -429,7 +429,122 @@ kernel void depthwise_conv_add_3x3_half(texture2d_array<half, access::sample> in
} }
// Half-precision convolution with five taps along y, followed by a bias add.
//
// Each thread writes one texel (four channels) of output slice gid.z.
//   inTexture  - input, sampled with pixel coordinates; out-of-range taps read zero
//   outTexture - output written at (gid.xy, gid.z)
//   param      - stride, offset and dilation of the convolution
//   weights    - indexed as [gid.z * 4 + lane][tap][input slice], one half4 per entry
//   biase      - one half4 per output slice
//
// All products are accumulated in float: every lane converts the sampled texel and the
// weight to float4 before the dot product, so no lane is reduced in half precision.
kernel void conv_add_5x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
                              texture2d_array<half, access::write> outTexture [[texture(1)]],
                              constant MetalConvParam &param [[buffer(0)]],
                              const device half4 *weights [[buffer(1)]],
                              const device half4 *biase [[buffer(2)]],
                              uint3 gid [[thread_position_in_grid]]) {
    // Threads outside the output volume have nothing to write.
    if (gid.x >= outTexture.get_width() ||
        gid.y >= outTexture.get_height() ||
        gid.z >= outTexture.get_array_size()) {
        return;
    }

    ushort2 stride = ushort2(param.strideX, param.strideY);
    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);

    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);

    const uint kernelHXW = 5;
    uint input_arr_size = inTexture.get_array_size();
    // First weight of output lane 0 for this output slice.
    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;

    float4 output = float4(0.0);

    ushort dilation_y = param.dilationY;
    half4 input[5];

    for (uint i = 0; i < input_arr_size; ++i) {
        // Five vertical taps centred on posInInput, spaced by the y dilation.
        input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
        input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
        input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
        input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);

        for (int j = 0; j < 5; ++j) {
            const float4 texel = float4(input[j]);

            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
            output.x += dot(texel, float4(weight_x));

            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
            output.y += dot(texel, float4(weight_y));

            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
            output.z += dot(texel, float4(weight_z));

            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
            output.w += dot(texel, float4(weight_w));
        }
    }

    output = output + float4(biase[gid.z]);
    outTexture.write(half4(output), gid.xy, gid.z);
}
// Half-precision convolution with five taps along x, followed by a bias add.
//
// Each thread writes one texel (four channels) of output slice gid.z.
//   inTexture  - input, sampled with pixel coordinates; out-of-range taps read zero
//   outTexture - output written at (gid.xy, gid.z)
//   param      - stride, offset and dilation of the convolution
//   weights    - indexed as [gid.z * 4 + lane][tap][input slice], one half4 per entry
//   biase      - one half4 per output slice
//
// The accumulator is float4, so the texel and weight are widened to float4 before each
// dot product; this matches the other *_half convolution kernels and avoids reducing
// the four-element dot product in half precision.
kernel void conv_add_1x5_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
                              texture2d_array<half, access::write> outTexture [[texture(1)]],
                              constant MetalConvParam &param [[buffer(0)]],
                              const device half4 *weights [[buffer(1)]],
                              const device half4 *biase [[buffer(2)]],
                              uint3 gid [[thread_position_in_grid]]) {
    // Threads outside the output volume have nothing to write.
    if (gid.x >= outTexture.get_width() ||
        gid.y >= outTexture.get_height() ||
        gid.z >= outTexture.get_array_size()) {
        return;
    }

    ushort2 stride = ushort2(param.strideX, param.strideY);
    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);

    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);

    const uint kernelHXW = 5;
    uint input_arr_size = inTexture.get_array_size();
    // First weight of output lane 0 for this output slice.
    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;

    float4 output = float4(0.0);

    ushort dilation_x = param.dilationX;
    half4 input[5];

    for (uint i = 0; i < input_arr_size; ++i) {
        // Five horizontal taps centred on posInInput, spaced by the x dilation.
        input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
        input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
        input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
        input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);

        for (int j = 0; j < 5; ++j) {
            const float4 texel = float4(input[j]);

            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
            output.x += dot(texel, float4(weight_x));

            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
            output.y += dot(texel, float4(weight_y));

            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
            output.z += dot(texel, float4(weight_z));

            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
            output.w += dot(texel, float4(weight_w));
        }
    }

    output = output + float4(biase[gid.z]);
    outTexture.write(half4(output), gid.xy, gid.z);
}
kernel void test_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void test_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
...@@ -502,3 +617,6 @@ kernel void test_conv_add_3x3(texture2d_array<float, access::sample> inTexture [ ...@@ -502,3 +617,6 @@ kernel void test_conv_add_3x3(texture2d_array<float, access::sample> inTexture [
// output = output + biase[gid.z]; // output = output + biase[gid.z];
outTexture.write(output, gid.xy, gid.z); outTexture.write(output, gid.xy, gid.z);
} }
...@@ -148,4 +148,133 @@ kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture( ...@@ -148,4 +148,133 @@ kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(
} }
// 3x3 convolution over a half-precision texture array, accumulated in float.
//
// Each thread writes one texel (four channels) of output slice gid.z. Weights are
// indexed as [gid.z * 4 + lane][tap][input slice], one half4 per entry, with taps in
// row-major order starting at the upper-left neighbour.
kernel void conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
                          texture2d_array<half, access::write> outTexture [[texture(1)]],
                          constant MetalConvParam &param [[buffer(0)]],
                          const device half4 *weights [[buffer(1)]],
                          uint3 gid [[thread_position_in_grid]]) {
    // Threads outside the output volume have nothing to write.
    const bool insideOutput = gid.x < outTexture.get_width() &&
                              gid.y < outTexture.get_height() &&
                              gid.z < outTexture.get_array_size();
    if (!insideOutput) {
        return;
    }

    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);

    const ushort2 stride = ushort2(param.strideX, param.strideY);
    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);

    const uint kernelHXW = 9;
    const uint inputSlices = inTexture.get_array_size();
    // Number of half4 weights belonging to one output lane.
    const uint laneStride = kernelHXW * inputSlices;
    const uint weightBase = gid.z * laneStride * 4;

    float4 acc = float4(0.0);

    for (uint slice = 0; slice < inputSlices; ++slice) {
        for (int tap = 0; tap < 9; ++tap) {
            // Tap index -> neighbour offset in {-1, 0, 1} x {-1, 0, 1}, row-major.
            const int dx = tap % 3 - 1;
            const int dy = tap / 3 - 1;
            const float4 texel = float4(inTexture.sample(sample,
                                                         float2(posInInput.x + dx, posInInput.y + dy),
                                                         slice));

            const uint tapIndex = weightBase + tap * inputSlices + slice;
            acc.x += dot(texel, float4(weights[tapIndex + 0 * laneStride]));
            acc.y += dot(texel, float4(weights[tapIndex + 1 * laneStride]));
            acc.z += dot(texel, float4(weights[tapIndex + 2 * laneStride]));
            acc.w += dot(texel, float4(weights[tapIndex + 3 * laneStride]));
        }
    }

    outTexture.write(half4(acc), gid.xy, gid.z);
}
// Depthwise 3x3 convolution over a half-precision texture array, accumulated in float.
//
// Each thread reads the 3x3 neighbourhood of input slice gid.z and writes one texel of
// the same output slice. Weights are scalar halves indexed as [gid.z * 4 + lane][tap],
// with taps in row-major order starting at the upper-left neighbour.
kernel void depthwise_conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
                                    texture2d_array<half, access::write> outTexture [[texture(1)]],
                                    constant MetalConvParam &param [[buffer(0)]],
                                    const device half *weights [[buffer(1)]],
                                    uint3 gid [[thread_position_in_grid]]) {
    // Threads outside the output volume have nothing to write.
    const bool insideOutput = gid.x < outTexture.get_width() &&
                              gid.y < outTexture.get_height() &&
                              gid.z < outTexture.get_array_size();
    if (!insideOutput) {
        return;
    }

    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);

    const uint slice = gid.z;
    const ushort2 stride = ushort2(param.strideX, param.strideY);
    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);

    const uint kernelHXW = 9;
    const uint weightBase = gid.z * kernelHXW * 4;

    float4 acc = float4(0.0);

    for (int tap = 0; tap < 9; ++tap) {
        // Tap index -> neighbour offset in {-1, 0, 1} x {-1, 0, 1}, row-major.
        const int dx = tap % 3 - 1;
        const int dy = tap / 3 - 1;
        const half4 texel = inTexture.sample(sample,
                                             float2(posInInput.x + dx, posInInput.y + dy),
                                             slice);

        // One scalar weight per output lane for this tap.
        acc.x += float(texel.x) * float(weights[weightBase + 0 * kernelHXW + tap]);
        acc.y += float(texel.y) * float(weights[weightBase + 1 * kernelHXW + tap]);
        acc.z += float(texel.z) * float(weights[weightBase + 2 * kernelHXW + tap]);
        acc.w += float(texel.w) * float(weights[weightBase + 3 * kernelHXW + tap]);
    }

    outTexture.write(half4(acc), gid.xy, gid.z);
}
// 1x1 (pointwise) convolution over a half-precision texture array, accumulated in float.
//
// Each thread writes one texel (four channels) of output slice gid.z. Weights are
// indexed as [gid.z * 4 + lane][input slice], one half4 per entry.
kernel void conv_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
                          texture2d_array<half, access::write> outTexture [[texture(1)]],
                          constant MetalConvParam &param [[buffer(0)]],
                          const device half4 *weights [[buffer(1)]],
                          uint3 gid [[thread_position_in_grid]]) {
    // Threads outside the output volume have nothing to write.
    const bool insideOutput = gid.x < outTexture.get_width() &&
                              gid.y < outTexture.get_height() &&
                              gid.z < outTexture.get_array_size();
    if (!insideOutput) {
        return;
    }

    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);

    const ushort2 stride = ushort2(param.strideX, param.strideY);
    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);

    const uint kernelHXW = 1;
    const uint inputSlices = inTexture.get_array_size();
    // Number of half4 weights belonging to one output lane.
    const uint laneStride = kernelHXW * inputSlices;
    const uint weightBase = gid.z * laneStride * 4;

    float4 acc = float4(0.0);

    for (uint slice = 0; slice < inputSlices; ++slice) {
        const float4 texel = float4(inTexture.sample(sample, float2(posInInput), slice));
        const uint sliceIndex = weightBase + slice;

        acc.x += dot(texel, float4(weights[sliceIndex + 0 * laneStride]));
        acc.y += dot(texel, float4(weights[sliceIndex + 1 * laneStride]));
        acc.z += dot(texel, float4(weights[sliceIndex + 2 * laneStride]));
        acc.w += dot(texel, float4(weights[sliceIndex + 3 * laneStride]));
    }

    outTexture.write(half4(acc), gid.xy, gid.z);
}
...@@ -29,11 +29,11 @@ struct MetalConvTransposeParam{ ...@@ -29,11 +29,11 @@ struct MetalConvTransposeParam{
ushort dilationY; ushort dilationY;
}; };
kernel void conv_transpose(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void conv_transpose2x2_stride2(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]], texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvTransposeParam &param [[buffer(0)]], constant MetalConvTransposeParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]], const device float4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]){ uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) { gid.z >= outTexture.get_array_size()) {
...@@ -41,48 +41,134 @@ kernel void conv_transpose(texture2d_array<float, access::sample> inTexture [[te ...@@ -41,48 +41,134 @@ kernel void conv_transpose(texture2d_array<float, access::sample> inTexture [[te
} }
int input_array_size = inTexture.get_array_size(); int input_array_size = inTexture.get_array_size();
int kernel_index_x = gid.x % 2;
uint kernel_one_output_slice = input_array_size * param.kernelW * param.kernelH; int kernel_index_y = gid.y % 2;
int kernel_index = kernel_index_y * 2 + kernel_index_x;
uint kernel_stride_z = gid.z * 4 * (kernel_one_output_slice); int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size);
int input_x = gid.x / 2;
int input_y = gid.y / 2;
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
float4 output = float4(0.0);
for (int i = 0; i < input_array_size; ++i) {
float4 input = inTexture.sample(sample, float2(input_x, input_y), i);
float4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i];
float4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i];
float4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i];
float4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i];
output.x += dot(input, kernel_slice0);
output.y += dot(input, kernel_slice1);
output.z += dot(input, kernel_slice2);
output.w += dot(input, kernel_slice3);
}
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_transpose2x2_stride2_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvTransposeParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
float4 output; int input_array_size = inTexture.get_array_size();
int kernel_index_x = gid.x % 2;
int kernel_index_y = gid.y % 2;
int kernel_index = kernel_index_y * 2 + kernel_index_x;
int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size);
int input_x = gid.x / 2;
int input_y = gid.y / 2;
for (int w = 0; w < param.kernelW; ++w) { constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
int input_x = (gid.x - w * param.dilationX + param.paddingX) / param.strideX; float4 output = float4(0.0);
if (input_x < 0 || input_x >= int(inTexture.get_width())) { for (int i = 0; i < input_array_size; ++i) {
continue;
}
for (int h = 0; h < param.kernelH; ++h) { half4 input = inTexture.sample(sample, float2(input_x, input_y), i);
int input_y = (gid.y - h * param.dilationY + param.paddingY) / param.strideY;
if (input_y < 0 || input_y >= int(inTexture.get_height())) { half4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i];
continue; half4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i];
} half4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i];
half4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i];
uint kernel_index = (w * param.kernelH + h) * inTexture.get_array_size();
output.x += dot(float4(input), float4(kernel_slice0));
for (int slice = 0; slice < input_array_size; ++slice) {
output.y += dot(float4(input), float4(kernel_slice1));
float4 input;
float4 kernel_slice = weights[kernel_stride_z + 0 * kernel_one_output_slice + kernel_index + slice]; output.z += dot(float4(input), float4(kernel_slice2));
float4 kernel_slice1 = weights[kernel_stride_z + 1 * kernel_one_output_slice + kernel_index + slice];
output.w += dot(float4(input), float4(kernel_slice3));
float4 kernel_slice2 = weights[kernel_stride_z + 2 * kernel_one_output_slice + kernel_index + slice];
float4 kernel_slice3 = weights[kernel_stride_z + 3 * kernel_one_output_slice + kernel_index + slice];
input = inTexture.sample(sample, float2(input_x, input_x), slice);
output.x += dot(input, kernel_slice);
output.x += dot(input, kernel_slice1);
output.x += dot(input, kernel_slice2);
output.x += dot(input, kernel_slice3);
}
}
} }
outTexture.write(output, gid.xy, gid.z); outTexture.write(half4(output), gid.xy, gid.z);
} }
//kernel void conv_transpose(texture2d_array<float, access::sample> inTexture [[texture(0)]],
// texture2d_array<float, access::write> outTexture [[texture(1)]],
// constant MetalConvTransposeParam &param [[buffer(0)]],
// const device float4 *weights [[buffer(1)]],
// uint3 gid [[thread_position_in_grid]]){
// if (gid.x >= outTexture.get_width() ||
// gid.y >= outTexture.get_height() ||
// gid.z >= outTexture.get_array_size()) {
// return;
// }
//
// int input_array_size = inTexture.get_array_size();
//
// uint kernel_one_output_slice = input_array_size * param.kernelW * param.kernelH;
//
// uint kernel_stride_z = gid.z * 4 * (kernel_one_output_slice);
//
// constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
//
// float4 output;
//
// for (int w = 0; w < param.kernelW; ++w) {
// int top = gid.x - w * param.dilationX + param.paddingX;
// int input_x = top / param.strideX;
// if (top < 0 || input_x >= int(inTexture.get_width())) {
// continue;
// }
//
// for (int h = 0; h < param.kernelH; ++h) {
// int top_y = gid.y - h * param.dilationY + param.paddingY;
// int input_y = top_y / param.strideY;
// if (top_y < 0 || input_y >= int(inTexture.get_height())) {
// continue;
// }
//
// uint kernel_index = (w * param.kernelH + h) * inTexture.get_array_size();
//
// for (int slice = 0; slice < input_array_size; ++slice) {
//
// float4 input;
// float4 kernel_slice = weights[kernel_stride_z + 0 * kernel_one_output_slice + kernel_index + slice];
// float4 kernel_slice1 = weights[kernel_stride_z + 1 * kernel_one_output_slice + kernel_index + slice];
//
// float4 kernel_slice2 = weights[kernel_stride_z + 2 * kernel_one_output_slice + kernel_index + slice];
//
// float4 kernel_slice3 = weights[kernel_stride_z + 3 * kernel_one_output_slice + kernel_index + slice];
//
// input = inTexture.sample(sample, float2(input_x, input_y), slice);
// output.x += dot(input, kernel_slice);
// output.y += dot(input, kernel_slice1);
// output.z += dot(input, kernel_slice2);
// output.w += dot(input, kernel_slice3);
// }
// }
// }
//
// outTexture.write(output, gid.xy, gid.z);
//}
//
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
using namespace metal; using namespace metal;
struct ElementwiseAddParam { struct ElementwiseAddParam {
int32_t unsafe_one_dim;
int32_t fast; int32_t fast;
int32_t axis; int32_t axis;
int32_t yoff; int32_t yoff;
...@@ -36,7 +37,10 @@ kernel void elementwise_add(texture2d_array<float, access::read> inputX [[textur ...@@ -36,7 +37,10 @@ kernel void elementwise_add(texture2d_array<float, access::read> inputX [[textur
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return; gid.z >= outTexture.get_array_size()) return;
float4 rx, ry; float4 rx, ry;
if (pm.fast == 1) { if (pm.unsafe_one_dim == 1) {
rx = inputX.read(gid.xy, gid.z);
ry = inputY.read(uint2(0, 0), gid.z);
} else if (pm.fast == 1) {
rx = inputX.read(gid.xy, gid.z); rx = inputX.read(gid.xy, gid.z);
ry = inputY.read(gid.xy, gid.z); ry = inputY.read(gid.xy, gid.z);
} else { } else {
...@@ -59,3 +63,39 @@ kernel void elementwise_add(texture2d_array<float, access::read> inputX [[textur ...@@ -59,3 +63,39 @@ kernel void elementwise_add(texture2d_array<float, access::read> inputX [[textur
float4 r = rx + ry; float4 r = rx + ry;
outTexture.write(r, gid.xy, gid.z); outTexture.write(r, gid.xy, gid.z);
} }
// Element-wise addition of two half-precision texture arrays: out = X + Y.
//
// Each thread produces one texel of `outTexture` at (gid.xy, slice gid.z).
// The way Y is addressed is selected by `pm`:
//   * pm.unsafe_one_dim == 1 : Y is always read from texel (0, 0) of slice gid.z.
//   * pm.fast == 1           : Y is read at the same coordinate as X.
//   * otherwise              : every lane of the texel is located in Y separately
//                              through xyzn2abcd / invtrans / trans / abcd2xyzn.
kernel void elementwise_add_half(texture2d_array<half, access::read> inputX [[texture(0)]],
                                 texture2d_array<half, access::read> inputY [[texture(1)]],
                                 texture2d_array<half, access::write> outTexture [[texture(2)]],
                                 constant ElementwiseAddParam &pm [[buffer(0)]],
                                 uint3 gid [[thread_position_in_grid]]) {
    // Threads dispatched outside the output extent have nothing to write.
    if (gid.x >= outTexture.get_width() ||
        gid.y >= outTexture.get_height() ||
        gid.z >= outTexture.get_array_size()) return;

    half4 rx, ry;
    if (pm.unsafe_one_dim == 1) {
        rx = inputX.read(gid.xy, gid.z);
        ry = inputY.read(uint2(0, 0), gid.z);
    } else if (pm.fast == 1) {
        rx = inputX.read(gid.xy, gid.z);
        ry = inputY.read(gid.xy, gid.z);
    } else {
        rx = inputX.read(gid.xy, gid.z);
        int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
        int32_t y_abcd[4] = {1, 1, 1, 1}, y_xyzn[4];
        int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
        int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
        for (int n = 0; n < 4; n++) {
            // Resolve lane n of this texel. Without this assignment the fourth
            // coordinate stays 0 and all four lanes of ry receive the same Y value.
            // NOTE(review): assumes x_xyzn[3] is the lane index consumed by
            // xyzn2abcd -- confirm against that helper.
            x_xyzn[3] = n;
            xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
            invtrans(xtrans, x_abcd, t_abcd);
            // Copy the X coordinates from pm.axis onward into Y, shifted by pm.yoff.
            for (int k = pm.axis; k < (4 - pm.yoff); k++) {
                y_abcd[k+pm.yoff] = t_abcd[k];
            }
            trans(ytrans, y_abcd, t_abcd);
            abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
            ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
        }
    }
    half4 r = rx + ry;
    outTexture.write(r, gid.xy, gid.z);
}
...@@ -81,3 +81,71 @@ kernel void prelu_other(texture2d_array<float, access::sample> inTexture [[textu ...@@ -81,3 +81,71 @@ kernel void prelu_other(texture2d_array<float, access::sample> inTexture [[textu
outTexture.write(output, gid.xy, gid.z); outTexture.write(output, gid.xy, gid.z);
} }
// PReLU on a half-precision texture array with one alpha vector per array slice.
//
// For every component: out = in > 0 ? in : alpha[gid.z] * in.
kernel void prelu_channel_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
                               texture2d_array<half, access::write> outTexture [[texture(1)]],
                               const device half4 *alpha [[buffer(0)]],
                               uint3 gid [[thread_position_in_grid]]){
    // Only threads that fall inside the output extent do any work.
    const bool inside = gid.x < outTexture.get_width() &&
                        gid.y < outTexture.get_height() &&
                        gid.z < outTexture.get_array_size();
    if (!inside) {
        return;
    }

    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
    const half4 texel = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
    const half4 slope = alpha[gid.z];

    // Component-wise: keep positive values, scale the rest by the slope.
    const half4 activated = select(slope * texel, texel, texel > half4(0.0h));
    outTexture.write(activated, gid.xy, gid.z);
}
// PReLU on a half-precision texture array with one alpha vector per texel.
//
// The alpha buffer is indexed by (gid.y * width + gid.x) * array_size + gid.z,
// and for every component: out = in > 0 ? in : alpha * in.
kernel void prelu_element_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
                               texture2d_array<half, access::write> outTexture [[texture(1)]],
                               const device half4 *alpha [[buffer(0)]],
                               uint3 gid [[thread_position_in_grid]]){
    // Only threads that fall inside the output extent do any work.
    const bool inside = gid.x < outTexture.get_width() &&
                        gid.y < outTexture.get_height() &&
                        gid.z < outTexture.get_array_size();
    if (!inside) {
        return;
    }

    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
    const half4 texel = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);

    // Offset of this (x, y) position's first alpha entry; gid.z picks the slice.
    const int texel_base = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size();
    const half4 slope = alpha[texel_base + gid.z];

    // Component-wise: keep positive values, scale the rest by the slope.
    const half4 activated = select(slope * texel, texel, texel > half4(0.0h));
    outTexture.write(activated, gid.xy, gid.z);
}
// PReLU on a half-precision texture array with a single scalar alpha (alpha[0])
// shared by every texel and component: out = in > 0 ? in : alpha[0] * in.
kernel void prelu_other_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
                             texture2d_array<half, access::write> outTexture [[texture(1)]],
                             const device half *alpha [[buffer(0)]],
                             uint3 gid [[thread_position_in_grid]]){
    // Only threads that fall inside the output extent do any work.
    const bool inside = gid.x < outTexture.get_width() &&
                        gid.y < outTexture.get_height() &&
                        gid.z < outTexture.get_array_size();
    if (!inside) {
        return;
    }

    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
    const half4 texel = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
    const half slope = alpha[0];

    // Component-wise: keep positive values, scale the rest by the shared slope.
    const half4 activated = select(slope * texel, texel, texel > half4(0.0h));
    outTexture.write(activated, gid.xy, gid.z);
}
...@@ -60,7 +60,7 @@ class PoolOp<P: PrecisionType>: Operator<PoolKernel<P>, PoolParam<P>>, Runable, ...@@ -60,7 +60,7 @@ class PoolOp<P: PrecisionType>: Operator<PoolKernel<P>, PoolParam<P>>, Runable,
func delogOutput() { func delogOutput() {
print(" \(type) output: ") print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
// print("pool2d delog") // print("pool2d delog")
......
...@@ -51,13 +51,13 @@ class PreluOp<P: PrecisionType>: Operator<PreluKernel<P>, PreluParam<P>>, Runabl ...@@ -51,13 +51,13 @@ class PreluOp<P: PrecisionType>: Operator<PreluKernel<P>, PreluParam<P>>, Runabl
func delogOutput() { func delogOutput() {
print(" \(type) input: ") print(" \(type) input: ")
print(para.input.metalTexture.toTensor(dim: (n: para.input.originDim[0], c: para.input.originDim[1], h: para.input.originDim[2], w: para.input.originDim[3]), texturePrecision: computePrecision).strideArray()) print(para.input.metalTexture.toTensor(dim: (n: para.input.originDim[0], c: para.input.originDim[1], h: para.input.originDim[2], w: para.input.originDim[3])).strideArray())
print(" \(type) Alpha: ") print(" \(type) Alpha: ")
let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false) let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false)
print(" \(type) output: ") print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray())
} }
// print("softmax delog") // print("softmax delog")
......
...@@ -46,7 +46,7 @@ class ReluOp<P: PrecisionType>: Operator<ReluKernel<P>, ReluParam<P>>, Runable, ...@@ -46,7 +46,7 @@ class ReluOp<P: PrecisionType>: Operator<ReluKernel<P>, ReluParam<P>>, Runable,
func delogOutput() { func delogOutput() {
print(" \(type) output: ") print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
} }
} }
......
...@@ -76,7 +76,7 @@ class ReshapeOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>, ...@@ -76,7 +76,7 @@ class ReshapeOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>,
let originDim = para.output.originDim let originDim = para.output.originDim
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
print(outputArray.strideArray()) print(outputArray.strideArray())
} }
......
...@@ -54,7 +54,7 @@ class SoftmaxOp<P: PrecisionType>: Operator<SoftmaxKernel<P>, SoftmaxParam<P>>, ...@@ -54,7 +54,7 @@ class SoftmaxOp<P: PrecisionType>: Operator<SoftmaxKernel<P>, SoftmaxParam<P>>,
print("softmax delog") print("softmax delog")
let originDim = para.output.originDim let originDim = para.output.originDim
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
print(outputArray.strideArray()) print(outputArray.strideArray())
} }
} }
...@@ -95,7 +95,28 @@ class Tensor<P: PrecisionType>: Tensorial { ...@@ -95,7 +95,28 @@ class Tensor<P: PrecisionType>: Tensorial {
func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16) { func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16, convertToNHWC: Bool = true, withTranspose: Bool = false) {
if convertToNHWC {
// print(layout)
convert(to: DataLayout.NHWC())
}
if withTranspose {
let transposePointer = UnsafeMutablePointer<P>.allocate(capacity: numel())
let n = dim[0]
let hwc = numel()/n
for j in 0..<hwc {
for i in 0..<n {
//data[i * hwc + j]
transposePointer[j * n + i] = data[i * hwc + j]
}
}
dim.swapeDimAt(index1: 0, index2: 3)
data.release()
data.pointer = transposePointer
}
guard let floatPointer = data.pointer as? UnsafeMutablePointer<Float32> else { guard let floatPointer = data.pointer as? UnsafeMutablePointer<Float32> else {
fatalError(" not support yet ") fatalError(" not support yet ")
} }
...@@ -139,6 +160,8 @@ class Tensor<P: PrecisionType>: Tensorial { ...@@ -139,6 +160,8 @@ class Tensor<P: PrecisionType>: Tensorial {
for j in 0..<paddedC { for j in 0..<paddedC {
if j < C { if j < C {
dstPtr[j] = tmpPointer[j] dstPtr[j] = tmpPointer[j]
} else {
dstPtr[j] = 0
} }
} }
tmpPointer += C tmpPointer += C
...@@ -152,6 +175,47 @@ class Tensor<P: PrecisionType>: Tensorial { ...@@ -152,6 +175,47 @@ class Tensor<P: PrecisionType>: Tensorial {
float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count) float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
} }
convertedPointer.deinitialize(count: count)
convertedPointer.deallocate()
}
} else {
let C = dim[3]
let cSlices = (C + 3) / 4
let paddedC = cSlices * 4
let count = paddedC * dim[0] * dim[1] * dim[2]
if C == paddedC {
buffer = device.makeBuffer(length: count * precisionSize)
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
}
} else if C == 1 {
fatalError(" not support ")
} else {
buffer = device.makeBuffer(length: count * precisionSize)
let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
var tmpPointer = floatPointer
var dstPtr = convertedPointer
for _ in 0..<dim[0] * dim[1] * dim[2] {
for j in 0..<paddedC {
if j < C {
dstPtr[j] = tmpPointer[j]
} else {
dstPtr[j] = 0
}
}
tmpPointer += C
dstPtr += paddedC
}
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
}
convertedPointer.deinitialize(count: count) convertedPointer.deinitialize(count: count)
convertedPointer.deallocate() convertedPointer.deallocate()
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册