";
@@ -478,6 +493,7 @@
files = (
FC9D038020E22FBB000F735A /* FeedOp.swift in Sources */,
FC039B9F20E11CB20081E9F8 /* Tensor.swift in Sources */,
+ FCA67CD7213827AC00BD58AA /* ConvAddBNReluKernel.metal in Sources */,
4AF9287921341661005B6C3A /* Softmax.metal in Sources */,
FC0E2DBC20EE45FE009C1FAC /* ConvKernel.swift in Sources */,
FC039BAA20E11CBC0081E9F8 /* ElementwiseAddOp.swift in Sources */,
@@ -493,12 +509,15 @@
FC1B186620ECF1C600678B91 /* ResizeKernel.swift in Sources */,
FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */,
FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */,
+ FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */,
FCDDC6C6212F9FB800E5EF74 /* PreluKernel.swift in Sources */,
+ FCA67CD52138272900BD58AA /* ConvAddMetal.metal in Sources */,
FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */,
FCEBC0F420F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift in Sources */,
FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */,
4AF928772133F1DB005B6C3A /* BoxCoder.metal in Sources */,
FCEB684C212F093800D2448E /* PreluOp.swift in Sources */,
+ FCA67CD92138287B00BD58AA /* ConvBNReluKernel.metal in Sources */,
FC60DB8920E9AAA500FF203F /* MetalExtension.swift in Sources */,
FCEBC0F620F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift in Sources */,
FCDDC6CA212FDF6800E5EF74 /* BatchNormKernel.metal in Sources */,
@@ -550,6 +569,7 @@
FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */,
FC039BC020E11CC20081E9F8 /* BlockDesc.swift in Sources */,
FCD04E6820F315020007374F /* PoolKernel.swift in Sources */,
+ FC0226582138F38D00F395E2 /* PoolKernel.metal in Sources */,
FC039BAD20E11CBC0081E9F8 /* ReluOp.swift in Sources */,
FCBCCC572122F41300D94F7E /* DwConvBNReluOp.swift in Sources */,
FC039BBE20E11CC20081E9F8 /* OpDesc.swift in Sources */,
diff --git a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
index e250c37c49d642ce3a4aa5610c4d0dbf3a9ce9fb..2b7636ce00ba2ec56aabd7e732039973ee8efdb3 100644
--- a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
@@ -225,16 +225,12 @@ extension MTLComputeCommandEncoder {
let groupDepth = slices
let groups = MTLSize.init(width: groupWidth, height: groupHeight, depth: groupDepth)
-// print("groups: \(groups) ")
-// print("threads per group: \(threadsPerGroup)")
-
setComputePipelineState(computePipline)
dispatchThreadgroups(groups, threadsPerThreadgroup: threadsPerGroup)
}
}
-
public extension MTLTexture {
func stridableFloatArray(stridable: Bool = true) -> [(index: Int, value: P)] {
@@ -285,6 +281,23 @@ public extension MTLTexture {
return fArr
}
+ func float32Array() -> [Float32] {
+ if pixelFormat == .rgba32Float {
+ let float32Array = floatArray { (f: Float32) -> Float32 in
+ return f
+ }
+ return float32Array
+ } else if pixelFormat == .rgba16Float {
+
+ var float16Array = floatArray { (f: Float16) -> Float16 in
+ return f
+ }
+ return float16To32(input: &float16Array, count: float16Array.count)
+ } else {
+ fatalError()
+ }
+ }
+
func logDesc(header: String = "", stridable: Bool = true) -> T? {
print(header)
print("texture: \(self)")
@@ -341,14 +354,27 @@ public extension MTLTexture {
}
// n c h w - dim
- func toTensor(dim: (n: Int, c: Int, h: Int, w: Int)) -> [Float32] {
+ func toTensor(dim: (n: Int, c: Int, h: Int, w: Int), texturePrecision: ComputePrecision = .Float16) -> [Float32] {
// print("origin dim: \(dim)")
print("texture: ")
print(self)
+ var textureArray: [Float32]
+// if texturePrecision == .Float16
- let textureArray = floatArray { (i : Float32) -> Float32 in
- return i
+ if pixelFormat == .rgba32Float {
+ textureArray = floatArray { (i : Float32) -> Float32 in
+ return i
+ }
+ } else if pixelFormat == .rgba16Float {
+
+ var textureFloat16Array = floatArray { (i : Float16) -> Float16 in
+ return i
+ }
+ textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count)
+ } else {
+ fatalError(" 目前还不支持其他类型 ")
}
+
var output: [Float32] = []
for s in 0.. [Float32] {
+ func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int), texturePrecision: ComputePrecision = .Float16) -> [Float32] {
// print("origin dim: \(dim)")
// print("texture: ")
// print(self)
- let textureArray = floatArray { (i : Float32) -> Float32 in
- return i
+ var textureArray: [Float32]
+ if pixelFormat == .rgba32Float {
+ textureArray = floatArray { (i : Float32) -> Float32 in
+ return i
+ }
+ } else if pixelFormat == .rgba16Float {
+ var textureFloat16Array = floatArray { (i : Float16) -> Float16 in
+ return i
+ }
+ textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count)
+ } else {
+ fatalError(" 目前还不支持其他类型 ")
}
+
var output: [Float32] = []
-
let numOfASlice = dim.h * dim.w * 4
for h in 0.. Self
@@ -78,6 +79,28 @@ extension Float32: PrecisionType {
}
}
+public func float32ToFloat16(input: UnsafeMutablePointer, output: UnsafeMutableRawPointer, count: Int) {
+ var float32Buffer = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4)
+ var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2)
+ guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else {
+ fatalError(" float 32 to float 16 error ! ")
+ }
+}
+
+public func float16To32(input: UnsafeMutablePointer, count: Int) -> [Float32] {
+ var output = Array.init(repeating: 0.0, count: count)
+ float16to32(input: input, output: &output, count: count)
+ return output
+}
+
+public func float16to32(input: UnsafeMutablePointer, output: UnsafeMutablePointer, count: Int) {
+ var bufferFloat16 = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 2)
+ var bufferFloat32 = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 4)
+ if vImageConvert_Planar16FtoPlanarF(&bufferFloat16, &bufferFloat32, 0) != kvImageNoError {
+ fatalError(" convert float16 to float32 error")
+ }
+}
+
// N - 0 C - 1 H - 2 W - 3
struct DataLayout {
diff --git a/metal/paddle-mobile/paddle-mobile/Executor.swift b/metal/paddle-mobile/paddle-mobile/Executor.swift
index 3296d106569048c6de673c9abfa4e5e1fc0ec79a..47769527d355fae1fde4fd2c4e82631df8d6bf04 100644
--- a/metal/paddle-mobile/paddle-mobile/Executor.swift
+++ b/metal/paddle-mobile/paddle-mobile/Executor.swift
@@ -16,6 +16,8 @@ import Foundation
let testTo = 54
+let computePrecision: ComputePrecision = .Float32
+
public class ResultHolder {
public let dim: [Int]
public let resultArr: [P]
@@ -66,7 +68,6 @@ public class Executor {
let op = block.ops[i]
do {
let op = try OpCreator.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
-// op.inferShape()
ops.append(op)
} catch let error {
throw error
@@ -110,16 +111,12 @@ public class Executor {
}
buffer.addCompletedHandler { (commandbuffer) in
-
+
// let inputArr = resInput.floatArray(res: { (p:P) -> P in
// return p
// })
// print(inputArr.strideArray())
//
-// let inputArr = resInput.floatArray(res: { (p:P) -> P in
-// return p
-// })
-//
// writeToLibrary(fileName: "genet_input_hand", array: inputArr)
// print("write to library done")
// return
@@ -134,7 +131,7 @@ public class Executor {
print(" 第 \(i) 个 op: ")
op.delogOutput()
}
-//
+
// return
let afterDate = Date.init()
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift
index e7d118afe83a200a464d191c8d391df128fa3936..193a271ccfc3ea3a68429f227394552c9f609f6f 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift
@@ -58,25 +58,29 @@ class BoxcoderOp: Operator, BoxcoderParam
}
func delogOutput() {
-
print(" \(type) output: ")
- let priorBoxOriginDim = para.priorBox.originDim
- let priorBoxArray = para.priorBox.metalTexture.realNHWC(dim: (n: priorBoxOriginDim[0], h: priorBoxOriginDim[1], w: priorBoxOriginDim[2], c: priorBoxOriginDim[3]))
- print(" prior box ")
- print(priorBoxArray.strideArray())
-
- let priorBoxVarOriginDim = para.priorBoxVar.originDim
- let priorBoxVarArray = para.priorBoxVar.metalTexture.realNHWC(dim: (n: priorBoxVarOriginDim[0], h: priorBoxVarOriginDim[1], w: priorBoxVarOriginDim[2], c: priorBoxVarOriginDim[3]))
- print(" prior box var ")
- print(priorBoxVarArray.strideArray())
+// let priorBoxOriginDim = para.priorBox.originDim
+// let priorBoxArray: [Float32] = para.priorBox.metalTexture.realNHWC(dim: (n: priorBoxOriginDim[0], h: priorBoxOriginDim[1], w: priorBoxOriginDim[2], c: priorBoxOriginDim[3]))
+// print(" prior box ")
+// print(priorBoxArray.strideArray())
+//
+// let priorBoxVarOriginDim = para.priorBoxVar.originDim
+// let priorBoxVarArray: [Float32] = para.priorBoxVar.metalTexture.realNHWC(dim: (n: priorBoxVarOriginDim[0], h: priorBoxVarOriginDim[1], w: priorBoxVarOriginDim[2], c: priorBoxVarOriginDim[3]))
+// print(" prior box var ")
+// print(priorBoxVarArray.strideArray())
+//
+// let targetBoxOriginDim = para.targetBox.originDim
+// let targetBoxArray: [Float32] = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxOriginDim[0], h: targetBoxOriginDim[1], w: targetBoxOriginDim[2], c: targetBoxOriginDim[3]))
+// print(" target box ")
+// print(targetBoxArray.strideArray())
let targetBoxOriginDim = para.targetBox.originDim
- let targetBoxArray = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxOriginDim[0], h: targetBoxOriginDim[1], w: targetBoxOriginDim[2], c: targetBoxOriginDim[3]))
+ let targetBoxArray = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxOriginDim[0], h: targetBoxOriginDim[1], w: targetBoxOriginDim[2], c: targetBoxOriginDim[3]), texturePrecision: computePrecision)
print(" target box ")
print(targetBoxArray.strideArray())
let originDim = para.output.originDim
- let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
+ let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
print(" output ")
print(outputArray.strideArray())
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
index 84e58a0f3d537ed96d24744fc3803d0885375e14..67e564dddd0aadcb059767380e7946040e8cd284 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
@@ -65,17 +65,17 @@ class ConcatOp: Operator, ConcatParam>, Run
func delogOutput() {
print(" \(type) output: ")
-
let originDim = para.output.originDim
if para.output.transpose == [0, 1, 2, 3] {
- let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
+ let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] {
- print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+ print(para.output.metalTexture.toTensor(dim: (n: originDim[0], c: originDim[1], h: originDim[2], w: originDim[3]), texturePrecision: computePrecision).strideArray())
} else {
- fatalError()
+ fatalError(" not implemet")
}
}
+
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
index 8bb9349be7bd054a5e00005131b9baf1afa370d5..7bced214bd11bfef61eb405d59073f004e765e03 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
@@ -125,13 +125,6 @@ class ConvAddBatchNormReluOp: Operator P in
-// return p
-// }
-// //
-// writeToLibrary(fileName: "output_112x112x32_2", array: output)
-// print(" write done")
-//
-// // let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false)
+ // let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false)
}
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
index c42e5fa1d8a5de54c4ab4d251097eb876411a350..5e344014188061c3dbb411226b2655a3bc2659b8 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
@@ -46,9 +46,6 @@ class ConvAddParam: OpParam {
class ConvAddOp: Operator, ConvAddParam>, Runable, Creator, InferShaperable, Fusion{
typealias OpType = ConvAddOp
-
-
-
static func fusionNode() -> Node {
let beginNode = Node.init(inType: gConvType)
_ = beginNode
@@ -64,7 +61,6 @@ class ConvAddOp: Operator, ConvAddParam>,
return gConvAddType
}
-
func inferShape() {
let inDims = para.input.dim
@@ -101,10 +97,8 @@ class ConvAddOp: Operator, ConvAddParam>,
print(para.stride)
print("dilations: ")
print(para.dilations)
-
-
-
print(" \(type) output: ")
- print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+
+ print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray())
}
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift
index 3c521a2210614550577369c603dbbdc5e2cb6692..be8c57d3ace01dabd652e0e80a43c5a053213e28 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift
@@ -110,7 +110,7 @@ class ConvBNReluOp: Operator, ConvBNReluPa
func delogOutput() {
print(" \(type) output: ")
- print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray())
+ print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray())
}
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
index ba83de1bf892527737a3a1447352877919c8f18e..387fa420b68f8004a12af85ca398cf306f41a5c6 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
@@ -46,10 +46,10 @@ class ConvTransposeOp: Operator, ConvTr
print(" \(type) output: ")
let originDim = para.output.originDim
if para.output.transpose == [0, 1, 2, 3] {
- let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
+ let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] {
- print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+ print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray())
} else {
print(" not implement")
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift
index 639c22ce12c7a110cf58f3f9e7b9ee458d393260..36f477bc1cb48007b5b28bf27a7424940918025b 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift
@@ -58,6 +58,6 @@ class DepthConvOp: Operator, ConvParam>, Runa
func delogOutput() {
print(" \(type) output: ")
- print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray())
+ print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray())
}
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift
index 16a42d5c7b24e7b3a26cab35f68decd226076876..0ea8a62c5c0bf30da200add2a96410136d2f40fb 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift
@@ -65,6 +65,6 @@ class DwConvBNReluOp: Operator, ConvBNRelu
func delogOutput() {
print(" \(type) output: ")
- print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray())
+ print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray())
}
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
index 4812f051820385727e08ad79f40f7820bb3310f5..0f96b204d59f3d4a0dd0fae20340811855421c95 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
@@ -56,31 +56,30 @@ class ElementwiseAddOp: Operator, Elem
// para.output.dim = para.input.dim
}
+ func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+ do {
+ try kernel.compute(commandBuffer: buffer, param: para)
+ } catch let error {
+ throw error
+ }
+ }
+
func delogOutput() {
- print(" \(type) inputX: ")
- print(para.inputX.metalTexture.toTensor(dim: (n: para.inputX.tensorDim[0], c: para.inputX.tensorDim[1], h: para.inputX.tensorDim[2], w: para.inputX.tensorDim[3])).strideArray())
- print(" \(type) inputY: ")
- print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray())
+// print(" \(type) inputX: ")
+// print(para.inputX.metalTexture.toTensor(dim: (n: para.inputX.tensorDim[0], c: para.inputX.tensorDim[1], h: para.inputX.tensorDim[2], w: para.inputX.tensorDim[3])).strideArray())
+// print(" \(type) inputY: ")
+// print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray())
print(" \(type) output: ")
let originDim = para.output.originDim
if para.output.transpose == [0, 1, 2, 3] {
- let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
+ let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] {
- print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+ print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray())
} else {
print(" not implement")
}
-
- }
-
- func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
- do {
- try kernel.compute(commandBuffer: buffer, param: para)
- } catch let error {
- throw error
- }
}
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift
index 93560582166c53b6de308e53b1cda431151ad741..b6075a807d1637c554587ea99724d6ff1f38e7e6 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift
@@ -61,7 +61,7 @@ class FeedOp: Operator, FeedParam<
func delogOutput() {
print(" \(type) output: ")
- print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray())
+ print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray())
}
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift
index 530fb8a32b1aa97b6a61ed6f5f2d8a77f453a384..f58358761f820809685510fa4e9b5ff237567b3c 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift
@@ -19,68 +19,76 @@ public protocol TestParam {
}
public protocol Testable {
- associatedtype TestParamType: TestParam
- func test(commandBuffer: MTLCommandBuffer, param: TestParamType)
- init(device: MTLDevice, testParam: TestParamType)
+ associatedtype TestParamType: TestParam
+ func test(commandBuffer: MTLCommandBuffer, param: TestParamType)
+ init(device: MTLDevice, testParam: TestParamType)
}
protocol Computable {
- associatedtype ParamType: OpParam
- func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws
- init(device: MTLDevice, param: ParamType)
+ associatedtype ParamType: OpParam
+ func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws
+ init(device: MTLDevice, param: ParamType)
}
protocol KernelProtocol {
- var pipline: MTLComputePipelineState { get set }
- var functionName: String { get set }
-
+ var pipline: MTLComputePipelineState { get set }
+ var functionName: String { get set }
+
}
open class Kernel {
- let pipline: MTLComputePipelineState
- let functionName: String
- public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = true) {
- pipline = device.pipeLine(funcName: inFunctionName, inPaddleMobileLib: usePaddleMobileLib)
- functionName = inFunctionName
- }
+ let pipline: MTLComputePipelineState
+ let functionName: String
+ public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = true) {
+ pipline = device.pipeLine(funcName: inFunctionName, inPaddleMobileLib: usePaddleMobileLib)
+ functionName = inFunctionName
+ }
}
open class CusomKernel: Kernel {
- public struct Shape {
- public let width: Int
- public let height: Int
- public let channel: Int
- public init(inWidth: Int, inHeight: Int, inChannel: Int){
- width = inWidth
- height = inHeight
- channel = inChannel
- }
- }
- public let outputTexture: MTLTexture
- public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, usePaddleMobileLib: Bool = false) {
- let textureDesc = MTLTextureDescriptor.init()
- textureDesc.textureType = .type2D
- textureDesc.width = outputDim.width
- textureDesc.height = outputDim.height
- textureDesc.depth = (outputDim.channel + 3) / 4
- textureDesc.pixelFormat = .rgba32Float
- textureDesc.usage = [.shaderRead, .shaderWrite]
- textureDesc.storageMode = .shared
- outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error "
-
- super.init(device: device, inFunctionName: inFunctionName, usePaddleMobileLib: usePaddleMobileLib)
+ public struct Shape {
+ public let width: Int
+ public let height: Int
+ public let channel: Int
+ public init(inWidth: Int, inHeight: Int, inChannel: Int){
+ width = inWidth
+ height = inHeight
+ channel = inChannel
}
+ }
+ public let outputTexture: MTLTexture
+ public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, usePaddleMobileLib: Bool = false) {
+ let textureDesc = MTLTextureDescriptor.init()
+ textureDesc.textureType = .type2D
+ textureDesc.width = outputDim.width
+ textureDesc.height = outputDim.height
+ textureDesc.depth = (outputDim.channel + 3) / 4
- public func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws {
- guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
- throw PaddleMobileError.predictError(message: " encode is nil")
- }
- encoder.setTexture(inputTexuture, index: 0)
- encoder.setTexture(outputTexture, index: 1)
- encoder.dispatch(computePipline: pipline, outTexture: outputTexture)
- encoder.endEncoding()
+ if computePrecision == .Float16 {
+ textureDesc.pixelFormat = .rgba16Float
+ } else if computePrecision == .Float32 {
+ textureDesc.pixelFormat = .rgba32Float
+ } else {
+ fatalError()
}
+ textureDesc.usage = [.shaderRead, .shaderWrite]
+ textureDesc.storageMode = .shared
+ outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error "
+
+ super.init(device: device, inFunctionName: inFunctionName, usePaddleMobileLib: usePaddleMobileLib)
+ }
+
+ public func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws {
+ guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+ throw PaddleMobileError.predictError(message: " encode is nil")
+ }
+ encoder.setTexture(inputTexuture, index: 0)
+ encoder.setTexture(outputTexture, index: 1)
+ encoder.dispatch(computePipline: pipline, outTexture: outputTexture)
+ encoder.endEncoding()
+ }
+
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift
index 722ab6b64c953c1fef28082f75794d9e581251ef..939f5db5f192082470ea2ad8773db95af22ffed4 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift
@@ -18,22 +18,29 @@ struct BoxcoderMetalParam {
}
class BoxcoderKernel: Kernel, Computable{
- func compute(commandBuffer: MTLCommandBuffer, param: BoxcoderParam) throws {
- guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
- throw PaddleMobileError.predictError(message: " encode is nil")
- }
- encoder.setTexture(param.priorBox.metalTexture, index: 0)
- encoder.setTexture(param.priorBoxVar.metalTexture, index: 1)
- encoder.setTexture(param.targetBox.metalTexture, index: 2)
- encoder.setTexture(param.output.metalTexture, index: 3)
- var bmp = BoxcoderMetalParam.init()
- encoder.setBytes(&bmp, length: MemoryLayout.size, index: 0)
- encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
- encoder.endEncoding()
+ func compute(commandBuffer: MTLCommandBuffer, param: BoxcoderParam) throws {
+ guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+ throw PaddleMobileError.predictError(message: " encode is nil")
}
-
- required init(device: MTLDevice, param: BoxcoderParam
) {
- param.output.initTexture(device: device)
- super.init(device: device, inFunctionName: "boxcoder")
+ encoder.setTexture(param.priorBox.metalTexture, index: 0)
+ encoder.setTexture(param.priorBoxVar.metalTexture, index: 1)
+ encoder.setTexture(param.targetBox.metalTexture, index: 2)
+ encoder.setTexture(param.output.metalTexture, index: 3)
+ var bmp = BoxcoderMetalParam.init()
+ encoder.setBytes(&bmp, length: MemoryLayout.size, index: 0)
+ encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+ encoder.endEncoding()
+ }
+
+ required init(device: MTLDevice, param: BoxcoderParam) {
+ param.output.initTexture(device: device, computePrecision: computePrecision)
+ if computePrecision == .Float32 {
+ super.init(device: device, inFunctionName: "boxcoder")
+ } else if computePrecision == .Float16 {
+ super.init(device: device, inFunctionName: "boxcoder_half")
+ } else {
+ fatalError()
}
+ }
+
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift
index 60f1437e7fabf0ae088b41f37cc01e2981cbf236..644476ad9dbb471786611fe25a30ed9c4833edbd 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift
@@ -121,8 +121,14 @@ class ConcatKernel: Kernel, Computable{
}
required init(device: MTLDevice, param: ConcatParam) {
- param.output.initTexture(device: device, inTranspose: param.transpose)
- super.init(device: device, inFunctionName: "concat")
+ param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: computePrecision)
+ if computePrecision == .Float32 {
+ super.init(device: device, inFunctionName: "concat")
+ } else if computePrecision == .Float16 {
+ super.init(device: device, inFunctionName: "concat_half")
+ } else {
+ fatalError()
+ }
}
required init(device: MTLDevice, testParam: ConcatTestParam) {
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
index eabadc9d44e7b98fccb0f87e73dd2ffd8da931d7..092207cfb7b9fda63cd6b5aa7082640bae515149 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
@@ -15,127 +15,155 @@
import Foundation
struct ConvAddBatchNormReluTestParam: TestParam {
- let inputTexture: MTLTexture
- let outputTexture: MTLTexture
- var metalParam: MetalConvParam
- let filterBuffer: MTLBuffer
- let biaseBuffer: MTLBuffer
- let newScaleBuffer: MTLBuffer
- let newBiaseBuffer: MTLBuffer
- let filterSize: (width: Int, height: Int, channel: Int)
- init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) {
- inputTexture = inInputTexture
- outputTexture = inOutputTexture
- metalParam = inMetalParam
- filterBuffer = inFilterBuffer
- biaseBuffer = inBiaseBuffer
- newScaleBuffer = inNewScaleBuffer
- newBiaseBuffer = inNewBiaseBuffer
- filterSize = inFilterSize
- }
+ let inputTexture: MTLTexture
+ let outputTexture: MTLTexture
+ var metalParam: MetalConvParam
+ let filterBuffer: MTLBuffer
+ let biaseBuffer: MTLBuffer
+ let newScaleBuffer: MTLBuffer
+ let newBiaseBuffer: MTLBuffer
+ let filterSize: (width: Int, height: Int, channel: Int)
+ init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) {
+ inputTexture = inInputTexture
+ outputTexture = inOutputTexture
+ metalParam = inMetalParam
+ filterBuffer = inFilterBuffer
+ biaseBuffer = inBiaseBuffer
+ newScaleBuffer = inNewScaleBuffer
+ newBiaseBuffer = inNewBiaseBuffer
+ filterSize = inFilterSize
+ }
}
class ConvAddBatchNormReluKernel: Kernel, Computable, Testable {
- required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam) {
- if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 {
- super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
- } else if testParam.filterSize.channel == 1 {
- super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
- } else {
- super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
- }
+ required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam) {
+ if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 {
+ super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
+ } else if testParam.filterSize.channel == 1 {
+ super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
+ } else {
+ super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
}
+ }
+
+ var metalParam: MetalConvParam!
+
+ required init(device: MTLDevice, param: ConvAddBatchNormReluParam) {
- var metalParam: MetalConvParam!
-
- required init(device: MTLDevice, param: ConvAddBatchNormReluParam
) {
-
- param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1])
-
- if param.filter.width == 1 && param.filter.height == 1 {
- super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
- } else if param.filter.channel == 1 {
- super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
- } else {
- super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
- }
-
- param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
- param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
- param.variance.initBuffer(device: device)
- param.mean.initBuffer(device: device)
- param.scale.initBuffer(device: device)
- param.bias.initBuffer(device: device)
-
-
- let offsetX = param.filter.width/2 - Int(param.paddings[0])
- let offsetY = param.filter.height/2 - Int(param.paddings[1])
-
- print("offset x: \(offsetX)")
- print("offset y: \(offsetY)")
-
- let offsetZ = 0.0
- metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
-
- var invs: [P] = []
- let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
-
- for i in 0...stride {
- let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
- invs.append(P(inv))
- }
-
- let newScale: UnsafeMutablePointer = UnsafeMutablePointer
.allocate(capacity: param.scale.buffer.length)
- let newBiase: UnsafeMutablePointer
= UnsafeMutablePointer
.allocate(capacity: param.bias.buffer.length)
-
- let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
- let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
- let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
- for i in 0...stride {
- newScale[i] = invs[i] * scaleContents[i]
- newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i]
- }
- param.newBiase = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)
- param.newScale = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)
-
- newScale.deinitialize(count: param.scale.buffer.length)
- newScale.deallocate()
-
- newBiase.deinitialize(count: param.bias.buffer.length)
- newBiase.deallocate()
+ param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
+
+ if param.filter.width == 1 && param.filter.height == 1 {
+ super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
+ } else if param.filter.channel == 1 {
+ super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
+ } else {
+ super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
}
- func compute(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluParam) throws {
- guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
- throw PaddleMobileError.predictError(message: " encode is nil")
- }
+ param.filter.initBuffer(device: device, precision: computePrecision)
+
+ param.y.initBuffer(device: device, precision: computePrecision)
+
+ param.variance.initBuffer(device: device, precision: .Float32)
+ param.mean.initBuffer(device: device, precision: .Float32)
+ param.scale.initBuffer(device: device, precision: .Float32)
+ param.bias.initBuffer(device: device, precision: .Float32)
+
+
+ let offsetX = param.filter.width/2 - Int(param.paddings[0])
+ let offsetY = param.filter.height/2 - Int(param.paddings[1])
+
+ print("offset x: \(offsetX)")
+ print("offset y: \(offsetY)")
+
+ let offsetZ = 0.0
+ metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+
+ var invs: [P] = []
+ let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
+
+ for i in 0...stride {
+ let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
+ invs.append(P(inv))
+ }
+
+ let newScale: UnsafeMutablePointer = UnsafeMutablePointer
.allocate(capacity: param.scale.buffer.length)
+ let newBiase: UnsafeMutablePointer
= UnsafeMutablePointer
.allocate(capacity: param.bias.buffer.length)
+
+ let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
+ let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
+ let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
+ for i in 0...stride {
+ newScale[i] = invs[i] * scaleContents[i]
+ newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i]
+ }
+
+// var newScaleFP16: UnsafeMutableRawPointer
+//
+// float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleFP16, count: param.scale.buffer.length / MemoryLayout.size)
+
+
+// let newBiaseFloat16 = device.makeBuffer(length: <#T##Int#>, options: <#T##MTLResourceOptions#>)
+
+ var newBiaseBuffer: MTLBuffer
+ var newScaleBuffer: MTLBuffer
+
+ if computePrecision == .Float16 {
+ newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)!
+ newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)!
+ } else if computePrecision == .Float32 {
-
- encoder.setTexture(param.input.metalTexture, index: 0)
- encoder.setTexture(param.output.metalTexture, index: 1)
- encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0)
- encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
- encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
- encoder.setBuffer(param.newScale!, offset: 0, index: 3)
- encoder.setBuffer(param.newBiase!, offset: 0, index: 4)
- encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
- encoder.endEncoding()
+ newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
+ newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
+
+ float32ToFloat16(input: newBiase as! UnsafeMutablePointer, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout.size)
+
+ float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout.size)
+ } else {
+ fatalError(" unsupport ")
}
- public func test(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluTestParam) {
- guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
- fatalError()
- }
-
- encoder.setTexture(param.inputTexture, index: 0)
- encoder.setTexture(param.outputTexture, index: 1)
- var inMetalParam = param.metalParam
- encoder.setBytes(&inMetalParam, length: MemoryLayout.size, index: 0)
- encoder.setBuffer(param.filterBuffer, offset: 0, index: 1)
- encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2)
- encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3)
- encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4)
- encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
- encoder.endEncoding()
+ param.newBiase = newBiaseBuffer
+ param.newScale = newScaleBuffer
+
+ newScale.deinitialize(count: param.scale.buffer.length)
+ newScale.deallocate()
+
+ newBiase.deinitialize(count: param.bias.buffer.length)
+ newBiase.deallocate()
+ }
+
+ func compute(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluParam) throws {
+ guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+ throw PaddleMobileError.predictError(message: " encode is nil")
}
+
+
+ encoder.setTexture(param.input.metalTexture, index: 0)
+ encoder.setTexture(param.output.metalTexture, index: 1)
+ encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0)
+ encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+ encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
+ encoder.setBuffer(param.newScale!, offset: 0, index: 3)
+ encoder.setBuffer(param.newBiase!, offset: 0, index: 4)
+ encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+ encoder.endEncoding()
+ }
+
+ public func test(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluTestParam) {
+ guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+ fatalError()
+ }
+
+ encoder.setTexture(param.inputTexture, index: 0)
+ encoder.setTexture(param.outputTexture, index: 1)
+ var inMetalParam = param.metalParam
+ encoder.setBytes(&inMetalParam, length: MemoryLayout.size, index: 0)
+ encoder.setBuffer(param.filterBuffer, offset: 0, index: 1)
+ encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2)
+ encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3)
+ encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4)
+ encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
+ encoder.endEncoding()
+ }
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
index 83dd4f996ab23a94824deb6194241d6a52ace487..ce1e0f6560e9911e862ead537089d37fdb4fe1c4 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
@@ -17,22 +17,35 @@ import Foundation
class ConvAddKernel: Kernel, Computable {
var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvAddParam) {
- if param.filter.width == 1 && param.filter.height == 1 {
- super.init(device: device, inFunctionName: "conv_add_1x1")
- } else if param.filter.channel == 1 {
- super.init(device: device, inFunctionName: "depthwise_conv_add_3x3")
+
+ if computePrecision == .Float16 {
+ if param.filter.width == 1 && param.filter.height == 1 {
+ super.init(device: device, inFunctionName: "conv_add_1x1_half")
+ } else if param.filter.channel == 1 {
+ super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half")
+ } else {
+ super.init(device: device, inFunctionName: "conv_add_3x3_half")
+ }
+ } else if computePrecision == .Float32 {
+ if param.filter.width == 1 && param.filter.height == 1 {
+ super.init(device: device, inFunctionName: "conv_add_1x1")
+ } else if param.filter.channel == 1 {
+ super.init(device: device, inFunctionName: "depthwise_conv_add_3x3")
+ } else {
+ super.init(device: device, inFunctionName: "conv_add_3x3")
+ }
} else {
- super.init(device: device, inFunctionName: "conv_add_3x3")
+ fatalError()
}
- param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1])
+ param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
- param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
- param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
+ param.filter.initBuffer(device: device, precision: computePrecision)
+ param.y.initBuffer(device: device, precision: computePrecision)
print("offset x: \(offsetX)")
print("offset y: \(offsetY)")
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
index c5d3ffe6c944ab9019f5b80e66b4691057209529..cd528bb588849958722b24ea77a0e14a6abc502e 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
@@ -51,21 +51,33 @@ class ConvBNReluKernel: Kernel, Computable, Testable {
var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvBNReluParam) {
-
- if param.filter.width == 1 && param.filter.height == 1 {
- super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1")
- } else if param.filter.channel == 1 {
- super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3")
+ if computePrecision == .Float32 {
+ if param.filter.width == 1 && param.filter.height == 1 {
+ super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1")
+ } else if param.filter.channel == 1 {
+ super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3")
+ } else {
+ super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3")
+ }
+ } else if computePrecision == .Float16 {
+ if param.filter.width == 1 && param.filter.height == 1 {
+ super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half")
+ } else if param.filter.channel == 1 {
+ super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half")
+ } else {
+ super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half")
+ }
} else {
- super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3")
+ fatalError()
}
- param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1])
- param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
- param.variance.initBuffer(device: device)
- param.mean.initBuffer(device: device)
- param.scale.initBuffer(device: device)
- param.bias.initBuffer(device: device)
+ param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
+ param.filter.initBuffer(device: device, precision: computePrecision)
+
+ param.variance.initBuffer(device: device, precision: .Float32)
+ param.mean.initBuffer(device: device, precision: .Float32)
+ param.scale.initBuffer(device: device, precision: .Float32)
+ param.bias.initBuffer(device: device, precision: .Float32)
let offsetX = param.filter.width/2 - Int(param.paddings[0])
let offsetY = param.filter.height/2 - Int(param.paddings[1])
@@ -102,8 +114,26 @@ class ConvBNReluKernel: Kernel, Computable, Testable {
newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i]
}
- param.newBiase = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)
- param.newScale = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)
+ var newBiaseBuffer: MTLBuffer
+ var newScaleBuffer: MTLBuffer
+
+ if computePrecision == .Float32 {
+ newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)!
+ newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)!
+ } else if computePrecision == .Float16 {
+
+ newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
+ newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
+
+ float32ToFloat16(input: newBiase as! UnsafeMutablePointer, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout.size)
+
+ float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout.size)
+ } else {
+ fatalError(" unsupport ")
+ }
+
+ param.newBiase = newBiaseBuffer
+ param.newScale = newScaleBuffer
newScale.deinitialize(count: param.scale.buffer.length)
newScale.deallocate()
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
index 680beba1ea711b389dd6117fc84f00b6079c9a60..e0485851fd610781f475eb43be1ce6fd4937a4ef 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
@@ -39,7 +39,7 @@ class ConvKernel: Kernel, Computable {
let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0])
let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1])
let offsetZ = 0.0
- param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
+ param.filter.initBuffer(device: device, precision: ComputePrecision.Float32)
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift
index 5e8d92054bd2fa15af2d3e75860c0dc4d9b93e5c..b6db7231e83943dbce6f2cbe3266af9fbe508aef 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift
@@ -28,7 +28,7 @@ class PoolKernel: Kernel, Computable{
required init(device: MTLDevice, param: PoolParam) {
super.init(device: device, inFunctionName: "pool")
- param.output.initTexture(device: device, inTranspose: param.input.transpose)
+ param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
}
func compute(commandBuffer: MTLCommandBuffer, param: PoolParam
) throws {
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift
index ad925eb174414ed8f48cc8dd5bf090bc2ed0aed2..1545a848dacb4f11a2a68df31f7ea49a23799a87 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift
@@ -17,8 +17,8 @@ class PreluKernel: Kernel, Computable{
} else {
super.init(device: device, inFunctionName: "prelu_other")
}
- param.alpha.initBuffer(device: device)
- param.output.initTexture(device: device, inTranspose: param.input.transpose)
+ param.alpha.initBuffer(device: device, precision: computePrecision)
+ param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
}
func compute(commandBuffer: MTLCommandBuffer, param: PreluParam) throws {
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
index e2363e44d3a3d81b430f82303b2b1017ddfc5200..ece3e3915dd130c48ef717f29cd95a13eca52d0c 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
@@ -33,11 +33,16 @@ class PriorBoxKernel: Kernel, Computable{
var metalParam: PriorBoxMetalParam!
required init(device: MTLDevice, param: PriorBoxParam) {
- super.init(device: device, inFunctionName: "prior_box")
- param.output.initTexture(device: device, inTranspose: [2, 0, 1, 3])
+ if computePrecision == .Float32 {
+ super.init(device: device, inFunctionName: "prior_box")
+ } else if computePrecision == .Float16 {
+ super.init(device: device, inFunctionName: "prior_box_half")
+ } else {
+ fatalError()
+ }
-
- param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3])
+ param.output.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
+ param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
let n = 1
let h = param.output.dim[1]
@@ -79,7 +84,18 @@ class PriorBoxKernel: Kernel, Computable{
}
}
- param.newAspectRatios = outputAspectRatior
+ if computePrecision == .Float16 {
+ let buffer = device.makeBuffer(length: outputAspectRatior.count * MemoryLayout.size)
+ float32ToFloat16(input: &outputAspectRatior, output:(buffer?.contents())!, count: outputAspectRatior.count)
+ param.newAspectRatios = buffer
+
+ } else if computePrecision == .Float32 {
+ let buffer = device.makeBuffer(bytes: outputAspectRatior, length: outputAspectRatior.count * MemoryLayout.size, options: [])
+ param.newAspectRatios = buffer
+ } else {
+ fatalError()
+ }
+
let aspectRatiosSize = uint(outputAspectRatior.count)
let maxSizeSize: uint = uint(param.maxSizes.count)
@@ -102,12 +118,13 @@ class PriorBoxKernel: Kernel, Computable{
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setTexture(param.outputVariances.metalTexture, index: 2)
- encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0)
- encoder.setBytes(param.newAspectRatios!, length: MemoryLayout.size * param.newAspectRatios!.count, index: 1)
+
+ encoder.setBuffer(param.newAspectRatios!, offset: 0, index: 0)
+
+ encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 1)
+
encoder.setBytes(param.variances, length: MemoryLayout.size * param.variances.count, index: 2)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
-
-
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift
index 96a1abb6df964ee24d74ca9979ca59512f4e4265..3916c07ce5e8d4f3179a8a3100563a77e68eb53b 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift
@@ -15,58 +15,65 @@
import Foundation
struct ReshapeMetalParam {
- var idim: (Int32, Int32, Int32, Int32)
- var itrans: (Int32, Int32, Int32, Int32)
- var odim: (Int32, Int32, Int32, Int32)
- var otrans: (Int32, Int32, Int32, Int32)
+ var idim: (Int32, Int32, Int32, Int32)
+ var itrans: (Int32, Int32, Int32, Int32)
+ var odim: (Int32, Int32, Int32, Int32)
+ var otrans: (Int32, Int32, Int32, Int32)
}
struct ReshapeTestParam: TestParam {
- let inputTexture: MTLTexture
- let outputTexture: MTLTexture
- let param: ReshapeMetalParam
+ let inputTexture: MTLTexture
+ let outputTexture: MTLTexture
+ let param: ReshapeMetalParam
}
class ReshapeKernel: Kernel, Computable{
- required init(device: MTLDevice, param: ReshapeParam) {
- param.output.initTexture(device: device)
- super.init(device: device, inFunctionName: "reshape")
+ required init(device: MTLDevice, param: ReshapeParam
) {
+ param.output.initTexture(device: device, computePrecision: computePrecision)
+ if computePrecision == .Float32 {
+ super.init(device: device, inFunctionName: "reshape")
+ } else if computePrecision == .Float16 {
+ super.init(device: device, inFunctionName: "reshape_half")
+ } else {
+ fatalError()
}
-
- required init(device: MTLDevice, testParam: ReshapeTestParam) {
- super.init(device: device, inFunctionName: "reshape")
- }
-
- func compute(commandBuffer: MTLCommandBuffer, param: ReshapeParam
) throws {
- guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
- throw PaddleMobileError.predictError(message: " encoder is nil")
- }
- encoder.setTexture(param.input.metalTexture, index: 0)
- encoder.setTexture(param.output.metalTexture, index: 1)
- let id: [Int32] = (0..<4).map { Int32(param.input.dim[$0]) }
- let it: [Int32] = param.input.transpose.map { Int32($0) }
- let od: [Int32] = (0..<4).map { Int32(param.output.dim[$0]) }
- let ot: [Int32] = param.output.transpose.map { Int32($0) }
- var rmp = ReshapeMetalParam.init(
- idim: (id[0], id[1], id[2], id[3]),
- itrans: (it[0], it[1], it[2], it[3]),
- odim: (od[0], od[1], od[2], od[3]),
- otrans: (ot[0], ot[1], ot[2], ot[3])
- )
- encoder.setBytes(&rmp, length: MemoryLayout.size, index: 0)
- encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
- encoder.endEncoding()
+ }
+
+ required init(device: MTLDevice, testParam: ReshapeTestParam) {
+ super.init(device: device, inFunctionName: "reshape")
+ }
+
+ func compute(commandBuffer: MTLCommandBuffer, param: ReshapeParam) throws {
+ guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+ throw PaddleMobileError.predictError(message: " encoder is nil")
}
- func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) {
- guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
- fatalError()
- }
- encoder.setTexture(testParam.inputTexture, index: 0)
- encoder.setTexture(testParam.outputTexture, index: 1)
- var pm: ReshapeMetalParam = testParam.param
- encoder.setBytes(&pm, length: MemoryLayout.size, index: 0)
- encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture)
- encoder.endEncoding()
+ encoder.setTexture(param.input.metalTexture, index: 0)
+ encoder.setTexture(param.output.metalTexture, index: 1)
+ let id: [Int32] = (0..<4).map { Int32(param.input.dim[$0]) }
+ let it: [Int32] = param.input.transpose.map { Int32($0) }
+ let od: [Int32] = (0..<4).map { Int32(param.output.dim[$0]) }
+ let ot: [Int32] = param.output.transpose.map { Int32($0) }
+ var rmp = ReshapeMetalParam.init(
+ idim: (id[0], id[1], id[2], id[3]),
+ itrans: (it[0], it[1], it[2], it[3]),
+ odim: (od[0], od[1], od[2], od[3]),
+ otrans: (ot[0], ot[1], ot[2], ot[3])
+ )
+ encoder.setBytes(&rmp, length: MemoryLayout.size, index: 0)
+ encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+ encoder.endEncoding()
+ }
+
+ func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) {
+ guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+ fatalError()
}
+ encoder.setTexture(testParam.inputTexture, index: 0)
+ encoder.setTexture(testParam.outputTexture, index: 1)
+ var pm: ReshapeMetalParam = testParam.param
+ encoder.setBytes(&pm, length: MemoryLayout.size, index: 0)
+ encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture)
+ encoder.endEncoding()
+ }
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift
index 5d2d5b1c7af5d9822394d2e7de9b251085c035dc..6f6d0af477f62d7f438b8b6a38c825c2eb95163f 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift
@@ -38,7 +38,13 @@ class SoftmaxKernel: Kernel, Computable{
}
required init(device: MTLDevice, param: SoftmaxParam) {
- param.output.initTexture(device: device)
- super.init(device: device, inFunctionName: "softmax")
+ param.output.initTexture(device: device, computePrecision: computePrecision)
+ if computePrecision == .Float32 {
+ super.init(device: device, inFunctionName: "softmax")
+ } else if computePrecision == .Float16 {
+ super.init(device: device, inFunctionName: "softmax_half")
+ } else {
+ fatalError()
+ }
}
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Texture2DTo2DArrayKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Texture2DTo2DArrayKernel.swift
index 8554beea2bc336294a3c577ccb4294e59f426bdd..0943686660e4bdd91b6cd909dff04cdd497cd817 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Texture2DTo2DArrayKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Texture2DTo2DArrayKernel.swift
@@ -32,7 +32,14 @@ class Texture2DTo2DArrayKernel: Kernel, Computable{
}
required init(device: MTLDevice, param: FeedParam) {
- param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1])
- super.init(device: device, inFunctionName: "texture2d_to_2d_array")
+ param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
+ if computePrecision == .Float16 {
+ super.init(device: device, inFunctionName: "texture2d_to_2d_array_half")
+ } else if computePrecision == .Float32 {
+ super.init(device: device, inFunctionName: "texture2d_to_2d_array")
+ } else {
+ fatalError()
+ }
+
}
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift
index 33e1219b4d0fff972d8db3d16fc7ce1477841351..6594b3474f0abb04364246830f79302f487af499 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift
@@ -41,33 +41,27 @@ struct TransposeTestParam: TestParam {
}
class TransposeKernel: Kernel, Computable, Testable {
- var metalParam: TransposeMetalParam!
- func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam) throws {
- guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
- throw PaddleMobileError.predictError(message: " encode is nil")
- }
-
- encoder.setTexture(param.input.metalTexture, index: 0)
- encoder.setTexture(param.output.metalTexture, index: 1)
- encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0)
- encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
- encoder.endEncoding()
- }
required init(device: MTLDevice, param: TransposeParam) {
- param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3])
- super.init(device: device, inFunctionName: "transpose")
+ param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: computePrecision)
+ if computePrecision == .Float16 {
+ super.init(device: device, inFunctionName: "transpose_half")
+ } else if computePrecision == .Float32 {
+ super.init(device: device, inFunctionName: "transpose")
+ } else {
+ fatalError()
+ }
var invT: [Int] = [0, 1, 2, 3]
for (i, v) in param.input.transpose.enumerated() {
invT[v] = i
}
var axis: [Int] = [0, 1, 2, 3]
-// var doNothing = false
-// if param.axis.count == param.input.transpose.count {
-// doNothing = param.axis == param.input.transpose.map { Int32($0) }
-// }
+ // var doNothing = false
+ // if param.axis.count == param.input.transpose.count {
+ // doNothing = param.axis == param.input.transpose.map { Int32($0) }
+ // }
for i in 0..: Kernel, Computable, Testable {
}
metalParam = tmp
}
+
required init(device: MTLDevice, testParam: TransposeTestParam) {
- super.init(device: device, inFunctionName: "transpose")
- fatalError()
+ if computePrecision == .Float16 {
+ super.init(device: device, inFunctionName: "transpose_half")
+ } else if computePrecision == .Float32 {
+ super.init(device: device, inFunctionName: "transpose")
+ } else {
+ fatalError()
+ }
+ }
+
+ var metalParam: TransposeMetalParam!
+ func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam) throws {
+ guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+ throw PaddleMobileError.predictError(message: " encode is nil")
+ }
+
+ encoder.setTexture(param.input.metalTexture, index: 0)
+ encoder.setTexture(param.output.metalTexture, index: 1)
+ encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0)
+ encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+ encoder.endEncoding()
}
+
public func test(commandBuffer: MTLCommandBuffer, param: TransposeTestParam) {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal
index 9a177488861c42740f3a0343b3cb41bb0b969137..7abc17ec6e7a204af4d74b28d40e2a4c69dddc4b 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal
@@ -34,7 +34,6 @@ kernel void boxcoder(texture2d_array priorBox [[texture(0)]
float tw = exp(pv.z * t.z) * pw;
float th = exp(pv.w * t.w) * ph;
-
float4 r;
r.x = tx - tw / 2;
r.y = ty - th / 2;
@@ -43,3 +42,31 @@ kernel void boxcoder(texture2d_array priorBox [[texture(0)]
output.write(r, gid.xy, gid.z);
}
+
+kernel void boxcoder_half(texture2d_array priorBox [[texture(0)]],
+ texture2d_array priorBoxVar [[texture(1)]],
+ texture2d_array targetBox [[texture(2)]],
+ texture2d_array output[[texture(3)]],
+ uint3 gid [[thread_position_in_grid]]) {
+ half4 t = targetBox.read(gid.xy, gid.z);
+ half4 p = priorBox.read(gid.xy, gid.z);
+ half4 pv = priorBoxVar.read(gid.xy, gid.z);
+
+ float px = (float(p.x) + float(p.z)) / 2;
+ float py = (float(p.y) + float(p.w)) / 2;
+ float pw = float(p.z) - float(p.x);
+ float ph = float(p.w) - float(p.y);
+
+ float tx = float(pv.x) * float(t.x) * pw + px;
+ float ty = float(pv.y) * float(t.y) * ph + py;
+ float tw = exp(float(pv.z) * float(t.z)) * pw;
+ float th = exp(float(pv.w) * float(t.w)) * ph;
+
+ float4 r;
+ r.x = tx - tw / 2;
+ r.y = ty - th / 2;
+ r.z = tx + tw / 2;
+ r.w = ty + th / 2;
+
+ output.write(half4(r), gid.xy, gid.z);
+}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal
index c5c4ffc5c995500503411148db31b2acfa3459b6..d37be42be64f8fdd7325fd62a68e646737b6dedf 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal
@@ -57,3 +57,14 @@ inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) {
opos[trans[i]] = ipos[i];
}
}
+
+
+struct MetalConvParam {
+ short offsetX;
+ short offsetY;
+ short offsetZ;
+ ushort strideX;
+ ushort strideY;
+ ushort dilationX;
+ ushort dilationY;
+};
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Concat.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Concat.metal
index 09c0e8dadab759bbdf514f347eff3eb005bfac2f..92d80c315e0d5ca19711b4a2165c89077979d49d 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Concat.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Concat.metal
@@ -69,3 +69,48 @@ kernel void concat(texture2d_array in0 [[texture(0)]],
}
out.write(r, gid.xy, gid.z);
}
+
+kernel void concat_half(texture2d_array in0 [[texture(0)]],
+ texture2d_array in1 [[texture(1)]],
+ texture2d_array in2 [[texture(2)]],
+ texture2d_array in3 [[texture(3)]],
+ texture2d_array in4 [[texture(4)]],
+ texture2d_array in5 [[texture(5)]],
+ texture2d_array inx [[texture(6)]],
+ texture2d_array out [[texture(7)]],
+ constant ConcatParam & pm [[buffer(0)]],
+ uint3 gid [[thread_position_in_grid]]) {
+ ConcatParam cp = pm;
+ int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4];
+ half4 r;
+ for (int i = 0; i < 4; i++) {
+ xyzn[3] = i;
+ xyzn2abcd(cp.odim[3], xyzn, abcd);
+ int k = abcd[cp.axis] - cp.offset;
+ int j = 0;
+ if (k < 0) {
+ r[i] = inx.read(gid.xy, gid.z)[i];
+ } else {
+ for (; j < 6; j++) {
+ if (k < cp.vdim[j]) {
+ break;
+ }
+ k -= cp.vdim[j];
+ }
+ int ta = cp.odim[cp.axis];
+ abcd[cp.axis] = k;
+ cp.odim[cp.axis] = cp.vdim[j];
+ abcd2xyzn(cp.odim[3], abcd, oxyzn);
+ cp.odim[cp.axis] = ta;
+ switch (j) {
+ case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+ case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+ case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+ case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+ case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+ case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+ }
+ }
+ }
+ out.write(r, gid.xy, gid.z);
+}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..ffa66212b16bb6c6180910cae2d0c34f8659c556
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal
@@ -0,0 +1,308 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include
+#include "Common.metal"
+using namespace metal;
+
+
+kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvParam ¶m [[buffer(0)]],
+ const device half4 *weights [[buffer(1)]],
+ const device half4 *biase [[buffer(2)]],
+ const device float4 *new_scale [[buffer(3)]],
+ const device float4 *new_biase [[buffer(4)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 1;
+
+ uint input_arr_size = inTexture.get_array_size();
+ uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+ half4 output = half4(0.0);
+
+ half4 input;
+ for (uint i = 0; i < input_arr_size; ++i) {
+ input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+ half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
+ output.x += dot(input, weight_x);
+
+ half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
+ output.y += dot(input, weight_y);
+
+ half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
+ output.z += dot(input, weight_z);
+
+ half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+ output.w += dot(input, weight_w);
+ }
+
+ output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
+ outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvParam ¶m [[buffer(0)]],
+ const device half4 *weights [[buffer(1)]],
+ const device half4 *biase [[buffer(2)]],
+ const device float4 *new_scale [[buffer(3)]],
+ const device float4 *new_biase [[buffer(4)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 9;
+ uint input_arr_size = inTexture.get_array_size();
+ uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+ half4 output = half4(0.0);
+
+ half4 input[9];
+ for (uint i = 0; i < input_arr_size; ++i) {
+ input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
+ input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
+ input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
+ input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
+ input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+ input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
+ input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
+ input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
+ input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
+ for (int j = 0; j < 9; ++j) {
+ half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.x += dot(input[j], weight_x);
+
+ half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.y += dot(input[j], weight_y);
+
+ half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.z += dot(input[j], weight_z);
+
+ half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.w += dot(input[j], weight_w);
+ }
+ }
+ output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
+ outTexture.write(output, gid.xy, gid.z);
+}
+
+
+kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvParam ¶m [[buffer(0)]],
+ const device half *weights [[buffer(1)]],
+ const device half4 *biase [[buffer(2)]],
+ const device float4 *new_scale [[buffer(3)]],
+ const device float4 *new_biase [[buffer(4)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+ uint output_slice = gid.z;
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 9;
+ uint weithTo = gid.z * kernelHXW * 4;
+ half4 output = half4(0.0);
+ half4 inputs[9];
+ inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
+ inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
+ inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
+ inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
+ inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
+ inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
+ inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
+ inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
+ inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
+ for (int j = 0; j < 9; ++j) {
+ half4 input = inputs[j];
+ output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+ output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+ output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+ output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+ }
+ output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
+ outTexture.write(output, gid.xy, gid.z);
+}
+
+
+/*---------------------------------------------*/
+
+
+
+kernel void conv_add_batch_norm_relu_1x1(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvParam ¶m [[buffer(0)]],
+ const device float4 *weights [[buffer(1)]],
+ const device float4 *biase [[buffer(2)]],
+ const device float4 *new_scale [[buffer(3)]],
+ const device float4 *new_biase [[buffer(4)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 1;
+
+ uint input_arr_size = inTexture.get_array_size();
+ uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+ float4 output = float4(0.0);
+
+ float4 input;
+ for (uint i = 0; i < input_arr_size; ++i) {
+ input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+ float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
+ output.x += dot(input, weight_x);
+
+ float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
+ output.y += dot(input, weight_y);
+
+ float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
+ output.z += dot(input, weight_z);
+
+ float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+ output.w += dot(input, weight_w);
+ }
+ output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+ outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvParam ¶m [[buffer(0)]],
+ const device float4 *weights [[buffer(1)]],
+ const device float4 *biase [[buffer(2)]],
+ const device float4 *new_scale [[buffer(3)]],
+ const device float4 *new_biase [[buffer(4)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 9;
+ uint input_arr_size = inTexture.get_array_size();
+ uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+ float4 output = float4(0.0);
+
+ float4 input[9];
+ for (uint i = 0; i < input_arr_size; ++i) {
+ input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
+ input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
+ input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
+ input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
+ input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+ input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
+ input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
+ input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
+ input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
+ for (int j = 0; j < 9; ++j) {
+ float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.x += dot(input[j], weight_x);
+
+ float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.y += dot(input[j], weight_y);
+
+ float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.z += dot(input[j], weight_z);
+
+ float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.w += dot(input[j], weight_w);
+ }
+ }
+ output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+ outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvParam ¶m [[buffer(0)]],
+ const device float *weights [[buffer(1)]],
+ const device float4 *biase [[buffer(2)]],
+ const device float4 *new_scale [[buffer(3)]],
+ const device float4 *new_biase [[buffer(4)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+ uint output_slice = gid.z;
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 9;
+ uint weithTo = gid.z * kernelHXW * 4;
+ float4 output = float4(0.0);
+ float4 inputs[9];
+ inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
+ inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
+ inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
+ inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
+ inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
+ inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
+ inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
+ inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
+ inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
+ for (int j = 0; j < 9; ++j) {
+ float4 input = inputs[j];
+ output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+ output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+ output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+ output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+ }
+ output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+ outTexture.write(output, gid.xy, gid.z);
+}
+
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal
new file mode 100644
index 0000000000000000000000000000000000000000..9244b2ec4631015ffd192567f734bee4cc1c7c85
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal
@@ -0,0 +1,306 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include
+#include "Common.metal"
+
+using namespace metal;
+
+#pragma mark - convAdd
+kernel void conv_add_1x1(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvParam ¶m [[buffer(0)]],
+ const device float4 *weights [[buffer(1)]],
+ const device float4 *biase [[buffer(2)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 1;
+
+ uint input_arr_size = inTexture.get_array_size();
+ uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+ float4 output = float4(0.0);
+
+ float4 input;
+ for (uint i = 0; i < input_arr_size; ++i) {
+ input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+ float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
+ output.x += dot(input, weight_x);
+
+ float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
+ output.y += dot(input, weight_y);
+
+ float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
+ output.z += dot(input, weight_z);
+
+ float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+ output.w += dot(input, weight_w);
+ }
+ output = output + biase[gid.z];
+ outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void conv_add_3x3(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvParam ¶m [[buffer(0)]],
+ const device float4 *weights [[buffer(1)]],
+ const device float4 *biase [[buffer(2)]],
+ const device float4 *new_scale [[buffer(3)]],
+ const device float4 *new_biase [[buffer(4)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 9;
+ uint input_arr_size = inTexture.get_array_size();
+ uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+ float4 output = float4(0.0);
+
+ ushort dilation_x = param.dilationX;
+ ushort dilation_y = param.dilationY;
+
+ float4 input[9];
+ for (uint i = 0; i < input_arr_size; ++i) {
+ input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
+ input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+ input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
+ input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+ input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+ input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+ input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i);
+ input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+ input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i);
+ for (int j = 0; j < 9; ++j) {
+ float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.x += dot(input[j], weight_x);
+
+ float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.y += dot(input[j], weight_y);
+
+ float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.z += dot(input[j], weight_z);
+
+ float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.w += dot(input[j], weight_w);
+ }
+ }
+ output = output + biase[gid.z];
+ outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void depthwise_conv_add_3x3(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvParam ¶m [[buffer(0)]],
+ const device float *weights [[buffer(1)]],
+ const device float4 *biase [[buffer(2)]],
+ const device float4 *new_scale [[buffer(3)]],
+ const device float4 *new_biase [[buffer(4)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+ uint output_slice = gid.z;
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 9;
+ uint weithTo = gid.z * kernelHXW * 4;
+ float4 output = float4(0.0);
+ float4 inputs[9];
+ inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
+ inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
+ inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
+ inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
+ inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
+ inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
+ inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
+ inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
+ inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
+ for (int j = 0; j < 9; ++j) {
+ float4 input = inputs[j];
+ output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+ output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+ output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+ output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+ }
+ output = output + biase[gid.z];
+ outTexture.write(output, gid.xy, gid.z);
+}
+
+
+#pragma mark - half
+
+kernel void conv_add_1x1_half(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvParam ¶m [[buffer(0)]],
+ const device half4 *weights [[buffer(1)]],
+ const device half4 *biase [[buffer(2)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 1;
+
+ uint input_arr_size = inTexture.get_array_size();
+ uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+ float4 output = float4(0.0);
+
+ half4 input;
+ for (uint i = 0; i < input_arr_size; ++i) {
+ input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+ half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
+ output.x += dot(input, weight_x);
+
+ half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
+ output.y += dot(input, weight_y);
+
+ half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
+ output.z += dot(input, weight_z);
+
+ half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+ output.w += dot(input, weight_w);
+ }
+ output = output + float4(biase[gid.z]);
+ outTexture.write(half4(output), gid.xy, gid.z);
+}
+
+kernel void conv_add_3x3_half(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvParam ¶m [[buffer(0)]],
+ const device half4 *weights [[buffer(1)]],
+ const device half4 *biase [[buffer(2)]],
+ const device half4 *new_scale [[buffer(3)]],
+ const device half4 *new_biase [[buffer(4)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 9;
+ uint input_arr_size = inTexture.get_array_size();
+ uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+ float4 output = float4(0.0);
+
+ ushort dilation_x = param.dilationX;
+ ushort dilation_y = param.dilationY;
+
+ half4 input[9];
+ for (uint i = 0; i < input_arr_size; ++i) {
+ input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
+ input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+ input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
+ input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+ input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+ input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+ input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i);
+ input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+ input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i);
+ for (int j = 0; j < 9; ++j) {
+ half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.x += dot(float4(input[j]), float4(weight_x));
+
+ half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.y += dot(float4(input[j]), float4(weight_y));
+
+ half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.z += dot(float4(input[j]), float4(weight_z));
+
+ half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.w += dot(float4(input[j]), float4(weight_w));
+ }
+ }
+ output = output + float4(biase[gid.z]);
+ outTexture.write(half4(output), gid.xy, gid.z);
+}
+
+kernel void depthwise_conv_add_3x3_half(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvParam ¶m [[buffer(0)]],
+ const device half *weights [[buffer(1)]],
+ const device half4 *biase [[buffer(2)]],
+ const device half4 *new_scale [[buffer(3)]],
+ const device half4 *new_biase [[buffer(4)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+ uint output_slice = gid.z;
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 9;
+ uint weithTo = gid.z * kernelHXW * 4;
+ float4 output = float4(0.0);
+ half4 inputs[9];
+ inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
+ inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
+ inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
+ inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
+ inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
+ inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
+ inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
+ inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
+ inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
+ for (int j = 0; j < 9; ++j) {
+ half4 input = inputs[j];
+ output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]);
+ output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]);
+ output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
+ output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+ }
+ output = output + float4(biase[gid.z]);
+ outTexture.write(half4(output), gid.xy, gid.z);
+}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvBNReluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvBNReluKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..4b97b7829a1fba27704fe7b60a03b2672f4f5953
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvBNReluKernel.metal
@@ -0,0 +1,297 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include
+#include "Common.metal"
+
+using namespace metal;
+
+#pragma mark - conv bn relu
+kernel void conv_batch_norm_relu_1x1(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvParam ¶m [[buffer(0)]],
+ const device float4 *weights [[buffer(1)]],
+ const device float4 *new_scale [[buffer(2)]],
+ const device float4 *new_biase [[buffer(3)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 1;
+
+ uint input_arr_size = inTexture.get_array_size();
+ uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+ float4 output = float4(0.0);
+
+ float4 input;
+ for (uint i = 0; i < input_arr_size; ++i) {
+ input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+ float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
+ output.x += dot(input, weight_x);
+
+ float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
+ output.y += dot(input, weight_y);
+
+ float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
+ output.z += dot(input, weight_z);
+
+ float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+ output.w += dot(input, weight_w);
+ }
+ output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
+ outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void conv_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvParam ¶m [[buffer(0)]],
+ const device float4 *weights [[buffer(1)]],
+ const device float4 *new_scale [[buffer(2)]],
+ const device float4 *new_biase [[buffer(3)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 9;
+ uint input_arr_size = inTexture.get_array_size();
+ uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+ float4 output = float4(0.0);
+
+ float4 input[9];
+ for (uint i = 0; i < input_arr_size; ++i) {
+ input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
+ input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
+ input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
+ input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
+ input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+ input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
+ input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
+ input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
+ input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
+ for (int j = 0; j < 9; ++j) {
+ float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.x += dot(input[j], weight_x);
+
+ float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.y += dot(input[j], weight_y);
+
+ float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.z += dot(input[j], weight_z);
+
+ float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.w += dot(input[j], weight_w);
+ }
+ }
+ output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
+ outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvParam ¶m [[buffer(0)]],
+ const device float *weights [[buffer(1)]],
+ const device float4 *new_scale [[buffer(2)]],
+ const device float4 *new_biase [[buffer(3)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+ uint output_slice = gid.z;
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 9;
+ uint weithTo = gid.z * kernelHXW * 4;
+ float4 output = float4(0.0);
+ float4 inputs[9];
+ inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
+ inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
+ inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
+ inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
+ inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
+ inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
+ inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
+ inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
+ inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
+ for (int j = 0; j < 9; ++j) {
+ float4 input = inputs[j];
+ output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+ output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+ output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+ output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+ }
+ output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
+ outTexture.write(output, gid.xy, gid.z);
+}
+
+#pragma mark - half
+kernel void conv_batch_norm_relu_1x1_half(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvParam ¶m [[buffer(0)]],
+ const device half4 *weights [[buffer(1)]],
+ const device half4 *new_scale [[buffer(2)]],
+ const device half4 *new_biase [[buffer(3)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 1;
+
+ uint input_arr_size = inTexture.get_array_size();
+ uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+ float4 output = float4(0.0);
+
+ half4 input;
+ for (uint i = 0; i < input_arr_size; ++i) {
+ input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+ half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
+ output.x += dot(float4(input), float4(weight_x));
+
+ half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
+ output.y += dot(float4(input), float4(weight_y));
+
+ half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
+ output.z += dot(float4(input), float4(weight_z));
+
+ half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+ output.w += dot(float4(input), float4(weight_w));
+ }
+ output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+ outTexture.write(half4(output), gid.xy, gid.z);
+}
+
+kernel void conv_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvParam ¶m [[buffer(0)]],
+ const device half4 *weights [[buffer(1)]],
+ const device half4 *new_scale [[buffer(2)]],
+ const device half4 *new_biase [[buffer(3)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 9;
+ uint input_arr_size = inTexture.get_array_size();
+ uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+ float4 output = float4(0.0);
+
+ half4 input[9];
+ for (uint i = 0; i < input_arr_size; ++i) {
+ input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
+ input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
+ input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
+ input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
+ input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+ input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
+ input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
+ input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
+ input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
+ for (int j = 0; j < 9; ++j) {
+ half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.x += dot(float4(input[j]), float4(weight_x));
+
+ half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.y += dot(float4(input[j]), float4(weight_y));
+
+ half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.z += dot(float4(input[j]), float4(weight_z));
+
+ half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.w += dot(float4(input[j]), float4(weight_w));
+ }
+ }
+ output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+ outTexture.write(half4(output), gid.xy, gid.z);
+}
+
+kernel void depthwise_conv_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvParam ¶m [[buffer(0)]],
+ const device half *weights [[buffer(1)]],
+ const device half4 *new_scale [[buffer(2)]],
+ const device half4 *new_biase [[buffer(3)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+ uint output_slice = gid.z;
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 9;
+ uint weithTo = gid.z * kernelHXW * 4;
+ float4 output = float4(0.0);
+ half4 inputs[9];
+ inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
+ inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
+ inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
+ inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
+ inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
+ inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
+ inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
+ inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
+ inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
+ for (int j = 0; j < 9; ++j) {
+ half4 input = inputs[j];
+ output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+ output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+ output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+ output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+ }
+ output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+ outTexture.write(half4(output), gid.xy, gid.z);
+}
+
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
index 3254346c8da2b63420c6dcaf540e3c28f663b890..35c2c56cace5975c5fe166b007bb695eba163325 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
@@ -13,349 +13,9 @@
limitations under the License. */
#include
+#include "Common.metal"
using namespace metal;
-struct MetalConvParam {
- short offsetX;
- short offsetY;
- short offsetZ;
- ushort strideX;
- ushort strideY;
- ushort dilationX;
- ushort dilationY;
-};
-
-kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array inTexture [[texture(0)]],
- texture2d_array outTexture [[texture(1)]],
- constant MetalConvParam ¶m [[buffer(0)]],
- const device half4 *weights [[buffer(1)]],
- const device half4 *biase [[buffer(2)]],
- const device float4 *new_scale [[buffer(3)]],
- const device float4 *new_biase [[buffer(4)]],
- uint3 gid [[thread_position_in_grid]]) {
-
- if (gid.x >= outTexture.get_width() ||
- gid.y >= outTexture.get_height() ||
- gid.z >= outTexture.get_array_size()) {
- return;
- }
-
- ushort2 stride = ushort2(param.strideX, param.strideY);
- ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-
- constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
- const uint kernelHXW = 1;
-
- uint input_arr_size = inTexture.get_array_size();
- uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-
- half4 output = half4(0.0);
-
- half4 input;
- for (uint i = 0; i < input_arr_size; ++i) {
- input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
- half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
- output.x += dot(input, weight_x);
-
- half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
- output.y += dot(input, weight_y);
-
- half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
- output.z += dot(input, weight_z);
-
- half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
- output.w += dot(input, weight_w);
- }
-
- output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
- outTexture.write(output, gid.xy, gid.z);
-}
-
-kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]],
- texture2d_array outTexture [[texture(1)]],
- constant MetalConvParam ¶m [[buffer(0)]],
- const device half4 *weights [[buffer(1)]],
- const device half4 *biase [[buffer(2)]],
- const device float4 *new_scale [[buffer(3)]],
- const device float4 *new_biase [[buffer(4)]],
- uint3 gid [[thread_position_in_grid]]) {
-
- if (gid.x >= outTexture.get_width() ||
- gid.y >= outTexture.get_height() ||
- gid.z >= outTexture.get_array_size()) {
- return;
- }
-
- ushort2 stride = ushort2(param.strideX, param.strideY);
- const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-
- constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
- const uint kernelHXW = 9;
- uint input_arr_size = inTexture.get_array_size();
- uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-
- half4 output = half4(0.0);
-
- half4 input[9];
- for (uint i = 0; i < input_arr_size; ++i) {
- input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
- input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
- input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
- input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
- input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
- input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
- input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
- input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
- input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
- for (int j = 0; j < 9; ++j) {
- half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
- output.x += dot(input[j], weight_x);
-
- half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
- output.y += dot(input[j], weight_y);
-
- half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
- output.z += dot(input[j], weight_z);
-
- half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
- output.w += dot(input[j], weight_w);
- }
- }
- output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
- outTexture.write(output, gid.xy, gid.z);
-}
-
-kernel void conv_add_1x1_half(texture2d_array inTexture [[texture(0)]],
- texture2d_array outTexture [[texture(1)]],
- constant MetalConvParam ¶m [[buffer(0)]],
- const device half4 *weights [[buffer(1)]],
- const device half4 *biase [[buffer(2)]],
- uint3 gid [[thread_position_in_grid]]) {
-
- if (gid.x >= outTexture.get_width() ||
- gid.y >= outTexture.get_height() ||
- gid.z >= outTexture.get_array_size()) {
- return;
- }
-
- ushort2 stride = ushort2(param.strideX, param.strideY);
- ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-
- constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
- const uint kernelHXW = 1;
-
- uint input_arr_size = inTexture.get_array_size();
- uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-
- half4 output = half4(0.0);
-
- half4 input;
- for (uint i = 0; i < input_arr_size; ++i) {
- input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
- half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
- output.x += dot(input, weight_x);
-
- half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
- output.y += dot(input, weight_y);
-
- half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
- output.z += dot(input, weight_z);
-
- half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
- output.w += dot(input, weight_w);
- }
- output = output + biase[gid.z];
- outTexture.write(output, gid.xy, gid.z);
-}
-
-kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]],
- texture2d_array outTexture [[texture(1)]],
- constant MetalConvParam ¶m [[buffer(0)]],
- const device half *weights [[buffer(1)]],
- const device half4 *biase [[buffer(2)]],
- const device float4 *new_scale [[buffer(3)]],
- const device float4 *new_biase [[buffer(4)]],
- uint3 gid [[thread_position_in_grid]]) {
-
- if (gid.x >= outTexture.get_width() ||
- gid.y >= outTexture.get_height() ||
- gid.z >= outTexture.get_array_size()) {
- return;
- }
- uint output_slice = gid.z;
- ushort2 stride = ushort2(param.strideX, param.strideY);
- ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
- constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
- const uint kernelHXW = 9;
- uint weithTo = gid.z * kernelHXW * 4;
- half4 output = half4(0.0);
- half4 inputs[9];
- inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
- inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
- inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
- inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
- inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
- inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
- inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
- inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
- inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
- for (int j = 0; j < 9; ++j) {
- half4 input = inputs[j];
- output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
- output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
- output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
- output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
- }
- output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
- outTexture.write(output, gid.xy, gid.z);
-}
-
-
-/*---------------------------------------------*/
-
-
-
-kernel void conv_add_batch_norm_relu_1x1(texture2d_array inTexture [[texture(0)]],
- texture2d_array outTexture [[texture(1)]],
- constant MetalConvParam ¶m [[buffer(0)]],
- const device float4 *weights [[buffer(1)]],
- const device float4 *biase [[buffer(2)]],
- const device float4 *new_scale [[buffer(3)]],
- const device float4 *new_biase [[buffer(4)]],
- uint3 gid [[thread_position_in_grid]]) {
-
- if (gid.x >= outTexture.get_width() ||
- gid.y >= outTexture.get_height() ||
- gid.z >= outTexture.get_array_size()) {
- return;
- }
-
- ushort2 stride = ushort2(param.strideX, param.strideY);
- ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-
- constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
- const uint kernelHXW = 1;
-
- uint input_arr_size = inTexture.get_array_size();
- uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-
- float4 output = float4(0.0);
-
- float4 input;
- for (uint i = 0; i < input_arr_size; ++i) {
- input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
- float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
- output.x += dot(input, weight_x);
-
- float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
- output.y += dot(input, weight_y);
-
- float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
- output.z += dot(input, weight_z);
-
- float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
- output.w += dot(input, weight_w);
- }
- output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
- outTexture.write(output, gid.xy, gid.z);
-}
-
-kernel void conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]],
- texture2d_array outTexture [[texture(1)]],
- constant MetalConvParam ¶m [[buffer(0)]],
- const device float4 *weights [[buffer(1)]],
- const device float4 *biase [[buffer(2)]],
- const device float4 *new_scale [[buffer(3)]],
- const device float4 *new_biase [[buffer(4)]],
- uint3 gid [[thread_position_in_grid]]) {
-
- if (gid.x >= outTexture.get_width() ||
- gid.y >= outTexture.get_height() ||
- gid.z >= outTexture.get_array_size()) {
- return;
- }
-
- ushort2 stride = ushort2(param.strideX, param.strideY);
- const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-
- constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
- const uint kernelHXW = 9;
- uint input_arr_size = inTexture.get_array_size();
- uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-
- float4 output = float4(0.0);
-
- float4 input[9];
- for (uint i = 0; i < input_arr_size; ++i) {
- input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
- input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
- input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
- input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
- input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
- input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
- input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
- input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
- input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
- for (int j = 0; j < 9; ++j) {
- float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
- output.x += dot(input[j], weight_x);
-
- float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
- output.y += dot(input[j], weight_y);
-
- float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
- output.z += dot(input[j], weight_z);
-
- float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
- output.w += dot(input[j], weight_w);
- }
- }
- output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
- outTexture.write(output, gid.xy, gid.z);
-}
-
-kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]],
- texture2d_array outTexture [[texture(1)]],
- constant MetalConvParam ¶m [[buffer(0)]],
- const device float *weights [[buffer(1)]],
- const device float4 *biase [[buffer(2)]],
- const device float4 *new_scale [[buffer(3)]],
- const device float4 *new_biase [[buffer(4)]],
- uint3 gid [[thread_position_in_grid]]) {
-
- if (gid.x >= outTexture.get_width() ||
- gid.y >= outTexture.get_height() ||
- gid.z >= outTexture.get_array_size()) {
- return;
- }
- uint output_slice = gid.z;
- ushort2 stride = ushort2(param.strideX, param.strideY);
- ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
- constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
- const uint kernelHXW = 9;
- uint weithTo = gid.z * kernelHXW * 4;
- float4 output = float4(0.0);
- float4 inputs[9];
- inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
- inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
- inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
- inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
- inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
- inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
- inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
- inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
- inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
- for (int j = 0; j < 9; ++j) {
- float4 input = inputs[j];
- output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
- output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
- output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
- output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
- }
- output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
- outTexture.write(output, gid.xy, gid.z);
-}
-
// conv
#pragma mark -- conv
kernel void conv_3x3(texture2d_array inTexture [[texture(0)]],
@@ -487,286 +147,5 @@ kernel void conv_1x1(texture2d_array inTexture [[texture(
outTexture.write(output, gid.xy, gid.z);
}
-#pragma mark - convAdd
-kernel void conv_add_1x1(texture2d_array inTexture [[texture(0)]],
- texture2d_array outTexture [[texture(1)]],
- constant MetalConvParam ¶m [[buffer(0)]],
- const device float4 *weights [[buffer(1)]],
- const device float4 *biase [[buffer(2)]],
- uint3 gid [[thread_position_in_grid]]) {
-
- if (gid.x >= outTexture.get_width() ||
- gid.y >= outTexture.get_height() ||
- gid.z >= outTexture.get_array_size()) {
- return;
- }
-
- ushort2 stride = ushort2(param.strideX, param.strideY);
- ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-
- constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
- const uint kernelHXW = 1;
-
- uint input_arr_size = inTexture.get_array_size();
- uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-
- float4 output = float4(0.0);
-
- float4 input;
- for (uint i = 0; i < input_arr_size; ++i) {
- input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
- float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
- output.x += dot(input, weight_x);
-
- float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
- output.y += dot(input, weight_y);
-
- float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
- output.z += dot(input, weight_z);
-
- float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
- output.w += dot(input, weight_w);
- }
- output = output + biase[gid.z];
- outTexture.write(output, gid.xy, gid.z);
-}
-
-kernel void conv_add_3x3(texture2d_array inTexture [[texture(0)]],
- texture2d_array outTexture [[texture(1)]],
- constant MetalConvParam ¶m [[buffer(0)]],
- const device float4 *weights [[buffer(1)]],
- const device float4 *biase [[buffer(2)]],
- const device float4 *new_scale [[buffer(3)]],
- const device float4 *new_biase [[buffer(4)]],
- uint3 gid [[thread_position_in_grid]]) {
-
- if (gid.x >= outTexture.get_width() ||
- gid.y >= outTexture.get_height() ||
- gid.z >= outTexture.get_array_size()) {
- return;
- }
-
- ushort2 stride = ushort2(param.strideX, param.strideY);
- const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-
- constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
- const uint kernelHXW = 9;
- uint input_arr_size = inTexture.get_array_size();
- uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-
- float4 output = float4(0.0);
-
- ushort dilation_x = param.dilationX;
- ushort dilation_y = param.dilationY;
-
- float4 input[9];
- for (uint i = 0; i < input_arr_size; ++i) {
- input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
- input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
- input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
- input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
- input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
- input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
- input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i);
- input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
- input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i);
- for (int j = 0; j < 9; ++j) {
- float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
- output.x += dot(input[j], weight_x);
-
- float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
- output.y += dot(input[j], weight_y);
-
- float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
- output.z += dot(input[j], weight_z);
-
- float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
- output.w += dot(input[j], weight_w);
- }
- }
- output = output + biase[gid.z];
- outTexture.write(output, gid.xy, gid.z);
-}
-
-kernel void depthwise_conv_add_3x3(texture2d_array inTexture [[texture(0)]],
- texture2d_array outTexture [[texture(1)]],
- constant MetalConvParam ¶m [[buffer(0)]],
- const device float *weights [[buffer(1)]],
- const device float4 *biase [[buffer(2)]],
- const device float4 *new_scale [[buffer(3)]],
- const device float4 *new_biase [[buffer(4)]],
- uint3 gid [[thread_position_in_grid]]) {
-
- if (gid.x >= outTexture.get_width() ||
- gid.y >= outTexture.get_height() ||
- gid.z >= outTexture.get_array_size()) {
- return;
- }
- uint output_slice = gid.z;
- ushort2 stride = ushort2(param.strideX, param.strideY);
- ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
- constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
- const uint kernelHXW = 9;
- uint weithTo = gid.z * kernelHXW * 4;
- float4 output = float4(0.0);
- float4 inputs[9];
- inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
- inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
- inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
- inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
- inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
- inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
- inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
- inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
- inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
- for (int j = 0; j < 9; ++j) {
- float4 input = inputs[j];
- output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
- output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
- output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
- output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
- }
- output = output + biase[gid.z];
- outTexture.write(output, gid.xy, gid.z);
-}
-
-#pragma mark - conv bn relu
-kernel void conv_batch_norm_relu_1x1(texture2d_array inTexture [[texture(0)]],
- texture2d_array outTexture [[texture(1)]],
- constant MetalConvParam ¶m [[buffer(0)]],
- const device float4 *weights [[buffer(1)]],
- const device float4 *new_scale [[buffer(2)]],
- const device float4 *new_biase [[buffer(3)]],
- uint3 gid [[thread_position_in_grid]]) {
-
- if (gid.x >= outTexture.get_width() ||
- gid.y >= outTexture.get_height() ||
- gid.z >= outTexture.get_array_size()) {
- return;
- }
-
- ushort2 stride = ushort2(param.strideX, param.strideY);
- ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-
- constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
- const uint kernelHXW = 1;
-
- uint input_arr_size = inTexture.get_array_size();
- uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-
- float4 output = float4(0.0);
-
- float4 input;
- for (uint i = 0; i < input_arr_size; ++i) {
- input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
- float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
- output.x += dot(input, weight_x);
-
- float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
- output.y += dot(input, weight_y);
-
- float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
- output.z += dot(input, weight_z);
-
- float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
- output.w += dot(input, weight_w);
- }
- output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
- outTexture.write(output, gid.xy, gid.z);
-}
-
-kernel void conv_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]],
- texture2d_array outTexture [[texture(1)]],
- constant MetalConvParam ¶m [[buffer(0)]],
- const device float4 *weights [[buffer(1)]],
- const device float4 *new_scale [[buffer(2)]],
- const device float4 *new_biase [[buffer(3)]],
- uint3 gid [[thread_position_in_grid]]) {
-
- if (gid.x >= outTexture.get_width() ||
- gid.y >= outTexture.get_height() ||
- gid.z >= outTexture.get_array_size()) {
- return;
- }
-
- ushort2 stride = ushort2(param.strideX, param.strideY);
- const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-
- constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
- const uint kernelHXW = 9;
- uint input_arr_size = inTexture.get_array_size();
- uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-
- float4 output = float4(0.0);
-
- float4 input[9];
- for (uint i = 0; i < input_arr_size; ++i) {
- input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
- input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
- input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
- input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
- input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
- input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
- input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
- input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
- input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
- for (int j = 0; j < 9; ++j) {
- float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
- output.x += dot(input[j], weight_x);
-
- float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
- output.y += dot(input[j], weight_y);
-
- float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
- output.z += dot(input[j], weight_z);
-
- float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
- output.w += dot(input[j], weight_w);
- }
- }
- output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
- outTexture.write(output, gid.xy, gid.z);
-}
-
-kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]],
- texture2d_array outTexture [[texture(1)]],
- constant MetalConvParam ¶m [[buffer(0)]],
- const device float *weights [[buffer(1)]],
- const device float4 *new_scale [[buffer(2)]],
- const device float4 *new_biase [[buffer(3)]],
- uint3 gid [[thread_position_in_grid]]) {
-
- if (gid.x >= outTexture.get_width() ||
- gid.y >= outTexture.get_height() ||
- gid.z >= outTexture.get_array_size()) {
- return;
- }
- uint output_slice = gid.z;
- ushort2 stride = ushort2(param.strideX, param.strideY);
- ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
- constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
- const uint kernelHXW = 9;
- uint weithTo = gid.z * kernelHXW * 4;
- float4 output = float4(0.0);
- float4 inputs[9];
- inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
- inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
- inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
- inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
- inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
- inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
- inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
- inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
- inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
- for (int j = 0; j < 9; ++j) {
- float4 input = inputs[j];
- output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
- output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
- output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
- output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
- }
- output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
- outTexture.write(output, gid.xy, gid.z);
-}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal
index a45063b3c09873b7c41f7540d1d33df2b1559e54..368509f001aca6361b81b9b7839cf24b2efc5c12 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal
@@ -44,18 +44,6 @@ kernel void resize(texture2d inTexture [[texture(0)]],
}
-
-//kernel void texture2d_to_2d_array(texture2d inTexture [[texture(0)]],
-// texture2d_array outTexture [[texture(1)]],
-// uint3 gid [[thread_position_in_grid]]) {
-// if (gid.x >= inTexture.get_width() ||
-// gid.y >= inTexture.get_height()){
-// return;
-// }
-// const half4 input = inTexture.read(gid.xy);
-// outTexture.write(input, gid.xy, 0);
-//}
-
kernel void texture2d_to_2d_array(texture2d inTexture [[texture(0)]],
texture2d_array outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
@@ -67,10 +55,9 @@ kernel void texture2d_to_2d_array(texture2d inTexture [[tex
outTexture.write(input, gid.xy, 0);
}
-
kernel void texture2d_to_2d_array_half(texture2d inTexture [[texture(0)]],
- texture2d_array outTexture [[texture(1)]],
- uint3 gid [[thread_position_in_grid]]) {
+ texture2d_array outTexture [[texture(1)]],
+ uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= inTexture.get_width() ||
gid.y >= inTexture.get_height()){
return;
@@ -79,113 +66,4 @@ kernel void texture2d_to_2d_array_half(texture2d inTexture [
outTexture.write(input, gid.xy, 0);
}
-struct PoolParam {
- int ksizeX;
- int ksizeY;
- int strideX;
- int strideY;
- int paddingX;
- int paddingY;
- int poolType;
-};
-kernel void pool(texture2d_array inTexture [[texture(0)]],
- texture2d_array outTexture [[texture(1)]],
- constant PoolParam &pm [[buffer(0)]],
- uint3 gid [[thread_position_in_grid]]) {
- if (gid.x >= outTexture.get_width() ||
- gid.y >= outTexture.get_height() ||
- gid.z >= outTexture.get_array_size()) return;
- int xmin = gid.x * pm.strideX - pm.paddingX;
- int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
- xmin = max(xmin, 0);
- int ymin = gid.y * pm.strideX - pm.paddingX;
- int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height()));
- ymin = max(ymin, 0);
-
- float4 r = 0;
- if (pm.poolType == 0) {
- r = inTexture.read(uint2(xmin, ymin), gid.z);
- for (int x = xmin; x < xmax; x++) {
- for (int y = ymin; y < ymax; y++) {
- r = fmax(r, inTexture.read(uint2(x, y), gid.z));
- }
- }
- } else if (pm.poolType == 1) {
- for (int x = xmin; x < xmax; x++) {
- for (int y = ymin; y < ymax; y++) {
- r += inTexture.read(uint2(x, y), gid.z);
- }
- }
- r /= pm.ksizeX * pm.ksizeY;
- }
- outTexture.write(r, gid.xy, gid.z);
-}
-
-
-kernel void pool_half(texture2d_array inTexture [[texture(0)]],
- texture2d_array outTexture [[texture(1)]],
- constant PoolParam &pm [[buffer(0)]],
- uint3 gid [[thread_position_in_grid]]) {
- if (gid.x >= outTexture.get_width() ||
- gid.y >= outTexture.get_height() ||
- gid.z >= outTexture.get_array_size()) return;
- int xmin = gid.x * pm.strideX - pm.paddingX;
- int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
- xmin = max(xmin, 0);
- int ymin = gid.y * pm.strideX - pm.paddingX;
- int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height()));
- ymin = max(ymin, 0);
-
- half4 r = 0;
- if (pm.poolType == 0) {
- r = inTexture.read(uint2(xmin, ymin), gid.z);
- for (int x = xmin; x < xmax; x++) {
- for (int y = ymin; y < ymax; y++) {
- r = fmax(r, inTexture.read(uint2(x, y), gid.z));
- }
- }
- } else if (pm.poolType == 1) {
- for (int x = xmin; x < xmax; x++) {
- for (int y = ymin; y < ymax; y++) {
- r += inTexture.read(uint2(x, y), gid.z);
- }
- }
- r /= pm.ksizeX * pm.ksizeY;
- }
- outTexture.write(r, gid.xy, gid.z);
-}
-
-struct TransposeParam {
- int iC;
- int oC;
- int axis[4];
-};
-
-kernel void transpose(texture2d_array inTexture [[texture(0)]],
- texture2d_array outTexture [[texture(1)]],
- constant TransposeParam &pm [[buffer(0)]],
- uint3 gid [[thread_position_in_grid]]) {
-
-
- if ((pm.axis[0] == 0) && (pm.axis[1] == 1) && (pm.axis[2] == 2) && (pm.axis[3] == 3)) {
- // do nothing
- float4 r = inTexture.read(gid.xy, gid.z);
- outTexture.write(r, gid.xy, gid.z);
- } else {
- float4 r;
- for (int n = 0; n < 4; n++) {
- int ixyzn[] = {int(gid.x), int(gid.y), int(gid.z), n};
- int iabcd[4], oabcd[4], oxyzn[4];
- xyzn2abcd(pm.oC, ixyzn, iabcd);
- oabcd[pm.axis[0]] = iabcd[0];
- oabcd[pm.axis[1]] = iabcd[1];
- oabcd[pm.axis[2]] = iabcd[2];
- oabcd[pm.axis[3]] = iabcd[3];
- abcd2xyzn(pm.iC, oabcd, oxyzn);
- float4 rt = inTexture.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2]);
- r[n] = rt[oxyzn[3]];
- }
- outTexture.write(r, gid.xy, gid.z);
- }
-}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PoolKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PoolKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..1f2f7240db2ba716090001ed539bddb87dff5117
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PoolKernel.metal
@@ -0,0 +1,93 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include
+#include "Common.metal"
+using namespace metal;
+
+struct PoolParam {
+ int ksizeX;
+ int ksizeY;
+ int strideX;
+ int strideY;
+ int paddingX;
+ int paddingY;
+ int poolType;
+};
+
+kernel void pool(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant PoolParam &pm [[buffer(0)]],
+ uint3 gid [[thread_position_in_grid]]) {
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) return;
+ int xmin = gid.x * pm.strideX - pm.paddingX;
+ int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
+ xmin = max(xmin, 0);
+ int ymin = gid.y * pm.strideX - pm.paddingX;
+ int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height()));
+ ymin = max(ymin, 0);
+
+ float4 r = 0;
+ if (pm.poolType == 0) {
+ r = inTexture.read(uint2(xmin, ymin), gid.z);
+ for (int x = xmin; x < xmax; x++) {
+ for (int y = ymin; y < ymax; y++) {
+ r = fmax(r, inTexture.read(uint2(x, y), gid.z));
+ }
+ }
+ } else if (pm.poolType == 1) {
+ for (int x = xmin; x < xmax; x++) {
+ for (int y = ymin; y < ymax; y++) {
+ r += inTexture.read(uint2(x, y), gid.z);
+ }
+ }
+ r /= pm.ksizeX * pm.ksizeY;
+ }
+ outTexture.write(r, gid.xy, gid.z);
+}
+
+kernel void pool_half(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant PoolParam &pm [[buffer(0)]],
+ uint3 gid [[thread_position_in_grid]]) {
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) return;
+ int xmin = gid.x * pm.strideX - pm.paddingX;
+ int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
+ xmin = max(xmin, 0);
+ int ymin = gid.y * pm.strideX - pm.paddingX;
+ int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height()));
+ ymin = max(ymin, 0);
+
+ half4 r = 0;
+ if (pm.poolType == 0) {
+ r = inTexture.read(uint2(xmin, ymin), gid.z);
+ for (int x = xmin; x < xmax; x++) {
+ for (int y = ymin; y < ymax; y++) {
+ r = fmax(r, inTexture.read(uint2(x, y), gid.z));
+ }
+ }
+ } else if (pm.poolType == 1) {
+ for (int x = xmin; x < xmax; x++) {
+ for (int y = ymin; y < ymax; y++) {
+ r += inTexture.read(uint2(x, y), gid.z);
+ }
+ }
+ r /= pm.ksizeX * pm.ksizeY;
+ }
+ outTexture.write(r, gid.xy, gid.z);
+}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal
index 1c5b08ee7eeaaa4fd2a8b5064a6af66c77596120..bd14a146100d0a0723f73ab5fd1f95d1f8e39c97 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal
@@ -15,8 +15,6 @@
#include
using namespace metal;
-
-
kernel void prelu_channel(texture2d_array inTexture [[texture(0)]],
texture2d_array outTexture [[texture(1)]],
const device float4 *alpha [[buffer(0)]],
@@ -82,3 +80,4 @@ kernel void prelu_other(texture2d_array inTexture [[textu
output.w = input.w > 0 ? input.w : (alpha_value * input.w);
outTexture.write(output, gid.xy, gid.z);
}
+
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal
index 6083c6b514a3d8a0918d585a950d915e69a045fe..794f0ea6770688f0468d9ab4f9716adf3e93dd0c 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal
@@ -35,8 +35,8 @@ struct PriorBoxMetalParam {
kernel void prior_box(texture2d_array inTexture [[texture(0)]],
texture2d_array outBoxTexture [[texture(1)]],
texture2d_array varianceTexture [[texture(2)]],
- constant PriorBoxMetalParam ¶m [[buffer(0)]],
- const device float *aspect_ratios [[buffer(1)]],
+ const device float *aspect_ratios [[buffer(0)]],
+ constant PriorBoxMetalParam ¶m [[buffer(1)]],
const device float4 *variances [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outBoxTexture.get_width() ||
@@ -96,3 +96,68 @@ kernel void prior_box(texture2d_array inTexture [[texture(0
}
}
+
+kernel void prior_box_half(texture2d_array