"; };
@@ -326,7 +324,6 @@
FCEB6837212F00B100D2448E /* metal */,
FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */,
FC0E2DBB20EE45FE009C1FAC /* ConvKernel.swift */,
- FC1B186520ECF1C600678B91 /* ResizeKernel.swift */,
FC0E2DB920EE3B8D009C1FAC /* ReluKernel.swift */,
FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */,
FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */,
@@ -506,7 +503,6 @@
FC039BBB20E11CC20081E9F8 /* ProgramDesc.swift in Sources */,
FC9D037920E229E4000F735A /* OpParam.swift in Sources */,
FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */,
- FC1B186620ECF1C600678B91 /* ResizeKernel.swift in Sources */,
FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */,
FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */,
FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */,
diff --git a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
index 2b7636ce00ba2ec56aabd7e732039973ee8efdb3..afbdccce5d54eff69d07ce7546679cf3781a27d2 100644
--- a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
@@ -113,7 +113,7 @@ extension MTLDevice {
return tensor
}
- func tensor2texture(value: [P], dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> MTLTexture {
+ func tensor2texture
(value: [P], dim: [Int], transpose: [Int] = [0, 1, 2, 3], inComputePrecision: ComputePrecision = .Float32) -> MTLTexture {
if value.count > 0 {
assert(value.count == dim.reduce(1) { $0 * $1 })
}
@@ -129,7 +129,13 @@ extension MTLDevice {
textureDesc.height = ndim[1]
textureDesc.depth = 1
textureDesc.usage = [.shaderRead, .shaderWrite]
- textureDesc.pixelFormat = .rgba32Float
+
+ if inComputePrecision == .Float16 {
+ textureDesc.pixelFormat = .rgba16Float
+ } else if inComputePrecision == .Float32 {
+ textureDesc.pixelFormat = .rgba32Float
+ }
+
textureDesc.textureType = .type2DArray
textureDesc.storageMode = .shared
textureDesc.cpuCacheMode = .defaultCache
@@ -354,13 +360,8 @@ public extension MTLTexture {
}
// n c h w - dim
- func toTensor(dim: (n: Int, c: Int, h: Int, w: Int), texturePrecision: ComputePrecision = .Float16) -> [Float32] {
-// print("origin dim: \(dim)")
- print("texture: ")
- print(self)
+ func toTensor(dim: (n: Int, c: Int, h: Int, w: Int)) -> [Float32] {
var textureArray: [Float32]
-// if texturePrecision == .Float16
-
if pixelFormat == .rgba32Float {
textureArray = floatArray { (i : Float32) -> Float32 in
return i
@@ -388,11 +389,10 @@ public extension MTLTexture {
}
}
}
- print(" tensor count -- \(output.count)")
return output
}
- func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int), texturePrecision: ComputePrecision = .Float16) -> [Float32] {
+ func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int)) -> [Float32] {
// print("origin dim: \(dim)")
// print("texture: ")
// print(self)
diff --git a/metal/paddle-mobile/paddle-mobile/Executor.swift b/metal/paddle-mobile/paddle-mobile/Executor.swift
index 757c13404a80b695946d0daa6121b67d0be46d0e..0085cac22fb61ecb27f65beaef897d496dc9dd0c 100644
--- a/metal/paddle-mobile/paddle-mobile/Executor.swift
+++ b/metal/paddle-mobile/paddle-mobile/Executor.swift
@@ -14,18 +14,18 @@
import Foundation
-let testTo = 61
+let testTo = 161
var isTest = false
-let computePrecision: ComputePrecision = .Float32
+let computePrecision: ComputePrecision = .Float16
-public class ResultHolder {
+public class ResultHolder {
public let dim: [Int]
- public let resultArr: [P]
+ public let resultArr: [Float32]
public var intermediateResults: [String : [Variant]]?
public let elapsedTime: Double
- public init(inDim: [Int], inResult: [P], inElapsedTime: Double, inIntermediateResults: [String : [Variant]]? = nil) {
+ public init(inDim: [Int], inResult: [Float32], inElapsedTime: Double, inIntermediateResults: [String : [Variant]]? = nil) {
dim = inDim
resultArr = inResult
elapsedTime = inElapsedTime
@@ -78,7 +78,7 @@ public class Executor {
}
}
- public func predict(input: MTLTexture, dim: [Int], completionHandle: @escaping (ResultHolder) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws {
+ public func predict(input: MTLTexture, dim: [Int], completionHandle: @escaping (ResultHolder) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws {
guard let buffer = queue.makeCommandBuffer() else {
throw PaddleMobileError.predictError(message: "CommandBuffer is nil")
}
@@ -114,12 +114,10 @@ public class Executor {
buffer.addCompletedHandler { (commandbuffer) in
-// let inputArr = resInput.floatArray(res: { (p:P) -> P in
-// return p
-// })
-// print(inputArr.strideArray())
+// let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
+//// print(inputArr.strideArray())
//
-// writeToLibrary(fileName: "genet_input_hand", array: inputArr)
+// writeToLibrary(fileName: "test_image_ssd", array: inputArr)
// print("write to library done")
// return
// print(inputArr)
@@ -133,23 +131,23 @@ public class Executor {
print(" 第 \(i) 个 op: ")
op.delogOutput()
}
-// self.ops[59].delogOutput()
+
+// return;
+// self.ops[testTo - 2].delogOutput()
+// self.ops[testTo - 1].delogOutput()
// self.ops[60].delogOutput()
- return
+// return
let afterDate = Date.init()
-
- var resultHolder: ResultHolder
+ var resultHolder: ResultHolder
if except > 0 {
- resultHolder = ResultHolder
.init(inDim: [], inResult: [], inElapsedTime: afterDate.timeIntervalSince(beforeDate), inIntermediateResults: outputTextures)
+ resultHolder = ResultHolder.init(inDim: [], inResult: [], inElapsedTime: afterDate.timeIntervalSince(beforeDate), inIntermediateResults: outputTextures)
} else {
let outputVar: Variant = self.program.scope.output()!
let output: Texture
= outputVar as! Texture
- resultHolder = ResultHolder
.init(inDim: output.dim.dims, inResult: output.metalTexture.floatArray(res: { (p:P) -> P in
- return p
- }), inElapsedTime: afterDate.timeIntervalSince(beforeDate))
+ resultHolder = ResultHolder.init(inDim: output.dim.dims, inResult: output.toTensor(), inElapsedTime: afterDate.timeIntervalSince(beforeDate))
}
completionHandle(resultHolder)
diff --git a/metal/paddle-mobile/paddle-mobile/Loader.swift b/metal/paddle-mobile/paddle-mobile/Loader.swift
index 4b5f91f9c771cd5d9031a548cdc1803fc417bbcc..68ce8c0691cd29a227758dc750315b45c305ed92 100644
--- a/metal/paddle-mobile/paddle-mobile/Loader.swift
+++ b/metal/paddle-mobile/paddle-mobile/Loader.swift
@@ -159,7 +159,7 @@ public class Loader {
} catch let error {
throw error
}
- tensor.convert(to: DataLayout.NHWC())
+// tensor.convert(to: DataLayout.NHWC())
// tensor.initBuffer(device: device)
scope[varDesc.name] = tensor
} else {
@@ -168,7 +168,7 @@ public class Loader {
}
} else {
if varDesc.name == fetchKey {
- scope[varDesc.name] = ResultHolder.init(inDim: [], inResult: [], inElapsedTime: 0.0)
+ scope[varDesc.name] = ResultHolder.init(inDim: [], inResult: [], inElapsedTime: 0.0)
} else if varDesc.name == feedKey {
}
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift
index 193a271ccfc3ea3a68429f227394552c9f609f6f..d7c1aba2494479f30806f80f9ccefe943b5f7101 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift
@@ -59,28 +59,28 @@ class BoxcoderOp: Operator, BoxcoderParam
func delogOutput() {
print(" \(type) output: ")
-// let priorBoxOriginDim = para.priorBox.originDim
-// let priorBoxArray: [Float32] = para.priorBox.metalTexture.realNHWC(dim: (n: priorBoxOriginDim[0], h: priorBoxOriginDim[1], w: priorBoxOriginDim[2], c: priorBoxOriginDim[3]))
+// let priorBoxpadToFourDim = para.priorBox.padToFourDim
+// let priorBoxArray: [Float32] = para.priorBox.metalTexture.realNHWC(dim: (n: priorBoxpadToFourDim[0], h: priorBoxpadToFourDim[1], w: priorBoxpadToFourDim[2], c: priorBoxpadToFourDim[3]))
// print(" prior box ")
// print(priorBoxArray.strideArray())
//
-// let priorBoxVarOriginDim = para.priorBoxVar.originDim
-// let priorBoxVarArray: [Float32] = para.priorBoxVar.metalTexture.realNHWC(dim: (n: priorBoxVarOriginDim[0], h: priorBoxVarOriginDim[1], w: priorBoxVarOriginDim[2], c: priorBoxVarOriginDim[3]))
+// let priorBoxVarpadToFourDim = para.priorBoxVar.padToFourDim
+// let priorBoxVarArray: [Float32] = para.priorBoxVar.metalTexture.realNHWC(dim: (n: priorBoxVarpadToFourDim[0], h: priorBoxVarpadToFourDim[1], w: priorBoxVarpadToFourDim[2], c: priorBoxVarpadToFourDim[3]))
// print(" prior box var ")
// print(priorBoxVarArray.strideArray())
//
-// let targetBoxOriginDim = para.targetBox.originDim
-// let targetBoxArray: [Float32] = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxOriginDim[0], h: targetBoxOriginDim[1], w: targetBoxOriginDim[2], c: targetBoxOriginDim[3]))
+// let targetBoxpadToFourDim = para.targetBox.padToFourDim
+// let targetBoxArray: [Float32] = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxpadToFourDim[0], h: targetBoxpadToFourDim[1], w: targetBoxpadToFourDim[2], c: targetBoxpadToFourDim[3]))
// print(" target box ")
// print(targetBoxArray.strideArray())
- let targetBoxOriginDim = para.targetBox.originDim
- let targetBoxArray = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxOriginDim[0], h: targetBoxOriginDim[1], w: targetBoxOriginDim[2], c: targetBoxOriginDim[3]), texturePrecision: computePrecision)
+ let targetBoxpadToFourDim = para.targetBox.padToFourDim
+ let targetBoxArray = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxpadToFourDim[0], h: targetBoxpadToFourDim[1], w: targetBoxpadToFourDim[2], c: targetBoxpadToFourDim[3]))
print(" target box ")
print(targetBoxArray.strideArray())
- let originDim = para.output.originDim
- let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
+ let padToFourDim = para.output.padToFourDim
+ let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(" output ")
print(outputArray.strideArray())
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
index 67e564dddd0aadcb059767380e7946040e8cd284..1abdb66aa7bdf89759a5987e3dde523c1f1dcf41 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
@@ -65,12 +65,12 @@ class ConcatOp: Operator, ConcatParam>, Run
func delogOutput() {
print(" \(type) output: ")
- let originDim = para.output.originDim
+ let padToFourDim = para.output.padToFourDim
if para.output.transpose == [0, 1, 2, 3] {
- let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
+ let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] {
- print(para.output.metalTexture.toTensor(dim: (n: originDim[0], c: originDim[1], h: originDim[2], w: originDim[3]), texturePrecision: computePrecision).strideArray())
+ print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3])).strideArray())
} else {
fatalError(" not implemet")
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
index 7bced214bd11bfef61eb405d59073f004e765e03..43935b65d1442d7c2e1ca3db49168140569c433f 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
@@ -34,7 +34,7 @@ class ConvAddBatchNormReluParam: OpParam {
scale = try ConvAddBatchNormReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
mean = try ConvAddBatchNormReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
- y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.inputs, from: inScope)
+ y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
} catch let error {
throw error
}
@@ -112,7 +112,7 @@ class ConvAddBatchNormReluOp: Operator: Operator, ConvAddParam>,
}
func delogOutput() {
+// print("op \(type): ")
+// print(" padding: ")
+// print(para.paddings)
+// print("stride: ")
+// print(para.stride)
+// print("dilations: ")
+// print(para.dilations)
+// print(" para input dim: ")
+// print(para.input.dim)
+// print(" para filter dim: ")
+// print(para.filter.dim)
+// print(" para output dim: ")
+// print(para.output.dim)
+// print(" biase: ")
+// let biase: [Float32] = para.y.buffer.array()
+// print(biase)
- print(" padding: ")
- print(para.paddings)
- print("stride: ")
- print(para.stride)
- print("dilations: ")
- print(para.dilations)
print(" \(type) output: ")
- print(" para input dim: ")
- print(para.input.dim)
- print(" para filter dim: ")
- print(para.filter.dim)
- print(" para output dim: ")
- print(para.output.dim)
- print(" biase: ")
- let biase: [Float32] = para.y.buffer.array()
- print(biase)
-
- print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray())
+ print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
}
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift
index be8c57d3ace01dabd652e0e80a43c5a053213e28..959fe44b98dabec2b39fdfdb438d482d720caa61 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift
@@ -110,7 +110,7 @@ class ConvBNReluOp: Operator, ConvBNReluPa
func delogOutput() {
print(" \(type) output: ")
- print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray())
+ print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
}
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvOp.swift
index 98f4a077671bbc3161f147bf73c9691dd1f86536..e82eb1f4753f0ebfdb5a949c85181a0ae52ea2da 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvOp.swift
@@ -75,7 +75,7 @@ class ConvOp: Operator, ConvParam>, Runable,
func delogOutput() {
print("conv output : ")
- print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray())
+ print(para.output.toTensor().strideArray())
// let _: Float16? = para.output.metalTexture.logDesc()
}
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
index 387fa420b68f8004a12af85ca398cf306f41a5c6..e1b62c24f4bec7104ba7489b56460884acc0cc21 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
@@ -43,13 +43,15 @@ class ConvTransposeOp: Operator, ConvTr
}
func delogOutput() {
+
print(" \(type) output: ")
- let originDim = para.output.originDim
+ let padToFourDim = para.output.padToFourDim
if para.output.transpose == [0, 1, 2, 3] {
- let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
+ let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] {
- print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray())
+ let output = para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]))
+ print(output.strideArray())
} else {
print(" not implement")
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift
index 36f477bc1cb48007b5b28bf27a7424940918025b..ec76eecf1fc9736d9dff6a4cf0d69a314a9b1e0d 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift
@@ -58,6 +58,6 @@ class DepthConvOp: Operator, ConvParam>, Runa
func delogOutput() {
print(" \(type) output: ")
- print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray())
+ print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
}
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift
index 0ea8a62c5c0bf30da200add2a96410136d2f40fb..8575cfd88c7ddea2f007cad21507b4620c87d3e2 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift
@@ -65,6 +65,6 @@ class DwConvBNReluOp: Operator, ConvBNRelu
func delogOutput() {
print(" \(type) output: ")
- print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray())
+ print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
}
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
index 0f96b204d59f3d4a0dd0fae20340811855421c95..7fac84812b115f28b391d5dfe29a4aed0fd46969 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
@@ -71,12 +71,15 @@ class ElementwiseAddOp: Operator, Elem
// print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray())
print(" \(type) output: ")
- let originDim = para.output.originDim
+
+ print(para.inputY)
+
+ let padToFourDim = para.output.padToFourDim
if para.output.transpose == [0, 1, 2, 3] {
- let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
+ let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] {
- print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray())
+ print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
} else {
print(" not implement")
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift
index b6075a807d1637c554587ea99724d6ff1f38e7e6..382ea58b844b25bb855ed7cdc155a860bca45da5 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift
@@ -61,7 +61,7 @@ class FeedOp: Operator, FeedParam<
func delogOutput() {
print(" \(type) output: ")
- print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray())
+ print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
}
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift
index bae452dec331957ceda5a6f503802352f63a6dbe..bf0a2959fa9627f5cbe994dcd183f9ed7f04456c 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift
@@ -15,53 +15,60 @@
import Foundation
class BatchNormKernel: Kernel, Computable {
- var newScale: MTLBuffer
- var newBias: MTLBuffer
+ var newScale: MTLBuffer
+ var newBias: MTLBuffer
+
+ required init(device: MTLDevice, param: BatchNormParam) {
+ guard let newScale = device.makeBuffer(length: param.inputScale.buffer.length) else {
+ fatalError()
+ }
+
+ guard let newBias = device.makeBuffer(length: param.inputBias.buffer.length) else {
+ fatalError()
+ }
+ self.newScale = newScale
+ self.newBias = newBias
+
+ if computePrecision == .Float32 {
+ super.init(device: device, inFunctionName: "batchnorm")
+ } else if computePrecision == .Float16 {
+ super.init(device: device, inFunctionName: "batchnorm_half")
+ } else {
+ fatalError()
+ }
+
+ let varianceBuffer : MTLBuffer = param.inputVariance.buffer
- required init(device: MTLDevice, param: BatchNormParam
) {
- guard let newScale = device.makeBuffer(length: param.inputScale.buffer.length) else {
- fatalError()
- }
- guard let newBias = device.makeBuffer(length: param.inputBias.buffer.length) else {
- fatalError()
- }
- self.newScale = newScale
- self.newBias = newBias
-
- super.init(device: device, inFunctionName: "batchnorm")
-
- let varianceBuffer : MTLBuffer = param.inputVariance.buffer
-
- var invStd: [Float32] = Array(repeating: 0, count: varianceBuffer.length)
- let varianceContents = varianceBuffer.contents().assumingMemoryBound(to: P.self)
- for i in 0..<(varianceBuffer.length / MemoryLayout
.stride) {
- invStd[i] = 1 / (Float32(varianceContents[i]) + param.epsilon).squareRoot()
- }
-
- let newScaleContents = newScale.contents().assumingMemoryBound(to: P.self)
- let newBiasContents = newBias.contents().assumingMemoryBound(to: P.self)
- let scale : MTLBuffer = param.inputScale.buffer
- let scaleContents = scale.contents().assumingMemoryBound(to: P.self)
- let bias : MTLBuffer = param.inputBias.buffer
- let biasContents = bias.contents().assumingMemoryBound(to: P.self)
- let meanContents = param.inputMean.buffer.contents().assumingMemoryBound(to: P.self)
-
- for i in 0..<(newScale.length / MemoryLayout
.stride) {
- newScaleContents[i] = P(invStd[i] * Float32(scaleContents[i]))
- newBiasContents[i] = P(Float32(biasContents[i]) - Float32(meanContents[i]) * invStd[i] * Float32(scaleContents[i]))
- }
+ var invStd: [Float32] = Array(repeating: 0, count: varianceBuffer.length)
+ let varianceContents = varianceBuffer.contents().assumingMemoryBound(to: P.self)
+ for i in 0..<(varianceBuffer.length / MemoryLayout
.stride) {
+ invStd[i] = 1 / (Float32(varianceContents[i]) + param.epsilon).squareRoot()
}
- func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam
) throws {
- guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
- throw PaddleMobileError.predictError(message: " encoder is nil")
- }
- print("BatchNorm compute")
- encoder.setTexture(param.input.metalTexture, index: 0)
- encoder.setTexture(param.output.metalTexture, index: 1)
- encoder.setBuffer(newScale, offset: 0, index: 0)
- encoder.setBuffer(newBias, offset: 0, index: 1)
- encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
- encoder.endEncoding()
+ let newScaleContents = newScale.contents().assumingMemoryBound(to: P.self)
+ let newBiasContents = newBias.contents().assumingMemoryBound(to: P.self)
+ let scale : MTLBuffer = param.inputScale.buffer
+ let scaleContents = scale.contents().assumingMemoryBound(to: P.self)
+ let bias : MTLBuffer = param.inputBias.buffer
+ let biasContents = bias.contents().assumingMemoryBound(to: P.self)
+ let meanContents = param.inputMean.buffer.contents().assumingMemoryBound(to: P.self)
+
+ for i in 0..<(newScale.length / MemoryLayout
.stride) {
+ newScaleContents[i] = P(invStd[i] * Float32(scaleContents[i]))
+ newBiasContents[i] = P(Float32(biasContents[i]) - Float32(meanContents[i]) * invStd[i] * Float32(scaleContents[i]))
+ }
+ }
+
+ func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam
) throws {
+ guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+ throw PaddleMobileError.predictError(message: " encoder is nil")
}
+ print("BatchNorm compute")
+ encoder.setTexture(param.input.metalTexture, index: 0)
+ encoder.setTexture(param.output.metalTexture, index: 1)
+ encoder.setBuffer(newScale, offset: 0, index: 0)
+ encoder.setBuffer(newBias, offset: 0, index: 1)
+ encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+ encoder.endEncoding()
+ }
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
index bcdcc732b08e5f578bf1b35eb754cf221217144c..66324dd47086fd7c1ccffb674c0f8b8623416e0d 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
@@ -49,26 +49,37 @@ class ConvAddBatchNormReluKernel: Kernel, Computable, Testable
var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvAddBatchNormReluParam) {
-
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
-
- if param.filter.width == 1 && param.filter.height == 1 {
- super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
- } else if param.filter.channel == 1 {
- super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
- } else {
- super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
- }
-
param.filter.initBuffer(device: device, precision: computePrecision)
-
param.y.initBuffer(device: device, precision: computePrecision)
-
param.variance.initBuffer(device: device, precision: .Float32)
param.mean.initBuffer(device: device, precision: .Float32)
param.scale.initBuffer(device: device, precision: .Float32)
param.bias.initBuffer(device: device, precision: .Float32)
+ if computePrecision == .Float32 {
+ if param.filter.width == 1 && param.filter.height == 1 {
+ super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
+ } else if param.filter.channel == 1 {
+ super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
+ } else if param.filter.width == 3 && param.filter.height == 3 {
+ super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
+ } else {
+ fatalError(" unsupport ")
+ }
+ } else if computePrecision == .Float16 {
+ if param.filter.width == 1 && param.filter.height == 1 {
+ super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1_half")
+ } else if param.filter.channel == 1 {
+ super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3_half")
+ } else if param.filter.width == 3 && param.filter.height == 3 {
+ super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3_half")
+ } else {
+ fatalError(" unsupport ")
+ }
+ } else {
+ fatalError()
+ }
let offsetX = param.filter.width/2 - Int(param.paddings[0])
let offsetY = param.filter.height/2 - Int(param.paddings[1])
@@ -108,10 +119,10 @@ class ConvAddBatchNormReluKernel: Kernel, Computable, Testable
var newBiaseBuffer: MTLBuffer
var newScaleBuffer: MTLBuffer
- if computePrecision == .Float16 {
+ if computePrecision == .Float32 {
newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)!
newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)!
- } else if computePrecision == .Float32 {
+ } else if computePrecision == .Float16 {
newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
@@ -138,7 +149,6 @@ class ConvAddBatchNormReluKernel: Kernel, Computable, Testable
throw PaddleMobileError.predictError(message: " encode is nil")
}
-
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0)
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
index f2ad60fcfc8ec77e40a21a130026fcb8fa290621..d2b54d9be60694871b4d4e3ba4bd3852bc94e875 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
@@ -17,14 +17,23 @@ import Foundation
class ConvAddKernel: Kernel, Computable {
var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvAddParam) {
-
+ param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
+ param.filter.initBuffer(device: device, precision: computePrecision)
+ param.y.initBuffer(device: device, precision: computePrecision)
+
if computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_1x1_half")
} else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half")
- } else {
+ } else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_add_3x3_half")
+ } else if param.filter.width == 1 && param.filter.height == 5 {
+ super.init(device: device, inFunctionName: "conv_add_5x1_half")
+ } else if param.filter.width == 5 && param.filter.height == 1 {
+ super.init(device: device, inFunctionName: "conv_add_1x5_half")
+ } else {
+ fatalError(" unsupport yet ")
}
} else if computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 {
@@ -35,22 +44,21 @@ class ConvAddKernel: Kernel, Computable {
super.init(device: device, inFunctionName: "conv_add_5x1")
} else if param.filter.width == 5 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_1x5")
- } else {
+ } else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_add_3x3")
+ } else {
+ fatalError(" unsupport yet ")
}
} else {
fatalError()
}
+
+
let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
- param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
-
- param.filter.initBuffer(device: device, precision: computePrecision)
- param.y.initBuffer(device: device, precision: computePrecision)
-
print(" function: \(functionName)")
print("offset x: \(offsetX)")
print("offset y: \(offsetY)")
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
index 4b978e24d2b968d4ee2ee8443d74ce54f502be8b..bbf4eeaaf41e321f2c7c469011f1a243ef035107 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
@@ -49,35 +49,41 @@ class ConvBNReluKernel: Kernel, Computable, Testable {
}
var metalParam: MetalConvParam!
-
+
required init(device: MTLDevice, param: ConvBNReluParam) {
+
+ param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
+ param.filter.initBuffer(device: device, precision: computePrecision)
+ param.variance.initBuffer(device: device, precision: .Float32)
+ param.mean.initBuffer(device: device, precision: .Float32)
+ param.scale.initBuffer(device: device, precision: .Float32)
+ param.bias.initBuffer(device: device, precision: .Float32)
+
if computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1")
} else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3")
- } else {
+ } else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3")
+ } else {
+ fatalError(" unsupport ")
}
} else if computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half")
} else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half")
- } else {
+ } else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half")
+ } else {
+ fatalError(" unsupport ")
}
} else {
fatalError()
}
- param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
- param.filter.initBuffer(device: device, precision: computePrecision)
-
- param.variance.initBuffer(device: device, precision: .Float32)
- param.mean.initBuffer(device: device, precision: .Float32)
- param.scale.initBuffer(device: device, precision: .Float32)
- param.bias.initBuffer(device: device, precision: .Float32)
+
let offsetX = param.filter.width/2 - Int(param.paddings[0])
let offsetY = param.filter.height/2 - Int(param.paddings[1])
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
index 6c10ba8d18f3c0e386769c11867e8bb361f49b21..345136a503d8eda6ad23f85ef01eb53fa539d453 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
@@ -27,18 +27,20 @@ public struct MetalConvParam {
class ConvKernel: Kernel, Computable {
var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvParam) {
+ param.filter.initBuffer(device: device, precision: ComputePrecision.Float32)
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_1x1")
} else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_3x3")
- } else {
+ } else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_3x3")
+ } else {
+ fatalError(" unsupport ")
}
-
+
let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0])
let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1])
let offsetZ = 0.0
- param.filter.initBuffer(device: device, precision: ComputePrecision.Float32)
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift
index 9354972d71c776392e9b7bf621e3e4de7233ffe8..435776c850854f2fc4259e8a2089299da825f463 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift
@@ -31,7 +31,27 @@ struct MetalConvTransposeParam {
class ConvTransposeKernel: Kernel, Computable{
var metalParam: MetalConvTransposeParam!
required init(device: MTLDevice, param: ConvTransposeParam) {
- super.init(device: device, inFunctionName: "conv_transpose")
+ param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
+ param.filter.initBuffer(device: device, precision: computePrecision, convertToNHWC: false, withTranspose: true)
+        // Both kernels selected below are stride-2 only, so reject any other stride up front.
+        // The previous test spelled `param.stride == [2, 2]` on both sides of `&&`.
+        // NOTE(review): the second operand presumably meant to check for a 2x2 filter — confirm.
+        guard param.stride == [2, 2] else {
+            fatalError(" -- conv transpose unsupported yet -- ")
+        }
+
+        // Pick the kernel variant that matches the compute precision.
+        if computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "conv_transpose2x2_stride2")
+        } else if computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "conv_transpose2x2_stride2_half")
+        } else {
+            fatalError()
+        }
+
+// let filter: [Float32] = param.filter.buffer.array()
+// print(" conv transpose filter")
+// print(filter)
let kernelWidth = UInt16(param.filter.width)
let kernelHeight = UInt16(param.filter.height)
@@ -43,9 +63,7 @@ class ConvTransposeKernel: Kernel, Computable{
let dilationY = UInt16(param.dilations[1])
metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY)
-
- param.output.initTexture(device: device, inTranspose: param.input.transpose)
- param.filter.initBuffer(device: device)
+
}
func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam) throws {
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift
index e8e9dffc4562f0fc411e81e25f245b09ce292b0b..a4c88016b3a442d8d6937214755db9e33e9cb28f 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift
@@ -15,6 +15,7 @@
import Foundation
struct ElementwiseAddMetalParam {
+ var unsafe_one_dim: Int32 = 0
var fast: Int32 = 0
var axis: Int32 = 0
var yoff: Int32 = 0
@@ -26,8 +27,14 @@ struct ElementwiseAddMetalParam {
class ElementwiseAddKernel: Kernel, Computable {
required init(device: MTLDevice, param: ElementwiseAddParam) {
- super.init(device: device, inFunctionName: "elementwise_add")
param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
+ if computePrecision == .Float32 {
+ super.init(device: device, inFunctionName: "elementwise_add")
+ } else if computePrecision == .Float16 {
+ super.init(device: device, inFunctionName: "elementwise_add_half")
+ } else {
+ fatalError()
+ }
}
func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam<P>) throws {
@@ -59,6 +66,11 @@ class ElementwiseAddKernel: Kernel, Computable {
emp.fast = 1
}
+ // TODO:
+ if param.inputY.tensorDim.cout() == 1 {
+ emp.unsafe_one_dim = 1;
+ }
+
encoder.setBytes(&emp, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift
index b6db7231e83943dbce6f2cbe3266af9fbe508aef..59754e57c112ebf5e300bd0ef98465e08744e4bf 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift
@@ -27,8 +27,14 @@ struct PoolMetalParam {
class PoolKernel: Kernel, Computable{
required init(device: MTLDevice, param: PoolParam) {
- super.init(device: device, inFunctionName: "pool")
param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
+ if computePrecision == .Float32 {
+ super.init(device: device, inFunctionName: "pool")
+ } else if computePrecision == .Float16 {
+ super.init(device: device, inFunctionName: "pool_half")
+ } else {
+ fatalError()
+ }
}
func compute(commandBuffer: MTLCommandBuffer, param: PoolParam<P>) throws {
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift
index 1545a848dacb4f11a2a68df31f7ea49a23799a87..32505faf0faeab97586352d60535d05e86e59631 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift
@@ -10,15 +10,27 @@ import Foundation
class PreluKernel: Kernel, Computable{
required init(device: MTLDevice, param: PreluParam) {
- if param.mode == "channel" {
- super.init(device: device, inFunctionName: "prelu_channel")
- } else if param.mode == "element" {
- super.init(device: device, inFunctionName: "prelu_element")
- } else {
- super.init(device: device, inFunctionName: "prelu_other")
- }
param.alpha.initBuffer(device: device, precision: computePrecision)
param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
+ if computePrecision == .Float32 {
+ if param.mode == "channel" {
+ super.init(device: device, inFunctionName: "prelu_channel")
+ } else if param.mode == "element" {
+ super.init(device: device, inFunctionName: "prelu_element")
+ } else {
+ super.init(device: device, inFunctionName: "prelu_other")
+ }
+ } else if computePrecision == .Float16 {
+ if param.mode == "channel" {
+ super.init(device: device, inFunctionName: "prelu_channel_half")
+ } else if param.mode == "element" {
+ super.init(device: device, inFunctionName: "prelu_element_half")
+ } else {
+ super.init(device: device, inFunctionName: "prelu_other_half")
+ }
+ } else {
+ fatalError()
+ }
}
func compute(commandBuffer: MTLCommandBuffer, param: PreluParam<P>) throws {
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
index ece3e3915dd130c48ef717f29cd95a13eca52d0c..426e91cfbe26385a7b30931d155513633d2bc988 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
@@ -33,6 +33,10 @@ class PriorBoxKernel: Kernel, Computable{
var metalParam: PriorBoxMetalParam!
required init(device: MTLDevice, param: PriorBoxParam) {
+
+ param.output.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
+ param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
+
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "prior_box")
} else if computePrecision == .Float16 {
@@ -41,9 +45,6 @@ class PriorBoxKernel: Kernel, Computable{
fatalError()
}
- param.output.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
- param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
-
let n = 1
let h = param.output.dim[1]
let w = param.output.dim[2]
@@ -52,11 +53,11 @@ class PriorBoxKernel: Kernel, Computable{
param.output.dim = Dim.init(inDim: [n, h, w, c])
param.output.transpose = [0, 1, 2, 3]
- let imageWidth = Float32(param.inputImage.originDim[3])
- let imageHeight = Float32(param.inputImage.originDim[2])
+ let imageWidth = Float32(param.inputImage.padToFourDim[3])
+ let imageHeight = Float32(param.inputImage.padToFourDim[2])
- let featureWidth = param.input.originDim[3]
- let featureHeight = param.input.originDim[2]
+ let featureWidth = param.input.padToFourDim[3]
+ let featureHeight = param.input.padToFourDim[2]
if param.stepW == 0 || param.stepH == 0 {
param.stepW = Float32(imageWidth) / Float32(featureWidth)
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift
index 3c669cf4d965f7842070c4d38427f6d1d7440db5..18f279e9f3c5226d6eea5b5e6f0a42502173071e 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift
@@ -15,17 +15,23 @@
import Foundation
class ReluKernel: Kernel, Computable{
- func compute(commandBuffer: MTLCommandBuffer, param: ReluParam) throws {
- guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
- throw PaddleMobileError.predictError(message: " encode is nil")
- }
- encoder.setTexture(param.input.metalTexture, index: 0)
- encoder.setTexture(param.output.metalTexture, index: 1)
- encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
- encoder.endEncoding()
+ func compute(commandBuffer: MTLCommandBuffer, param: ReluParam<P>) throws {
+ guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+ throw PaddleMobileError.predictError(message: " encode is nil")
}
-
- required init(device: MTLDevice, param: ReluParam<P>) {
- super.init(device: device, inFunctionName: "relu")
+ encoder.setTexture(param.input.metalTexture, index: 0)
+ encoder.setTexture(param.output.metalTexture, index: 1)
+ encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+ encoder.endEncoding()
+ }
+
+ required init(device: MTLDevice, param: ReluParam<P>) {
+ if computePrecision == .Float32 {
+ super.init(device: device, inFunctionName: "relu")
+ } else if computePrecision == .Float16 {
+ super.init(device: device, inFunctionName: "relu_half")
+ } else {
+ fatalError()
}
+ }
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeKernel.swift
deleted file mode 100644
index d2795111ad1f43c759b95aa52ed34085a4ac147a..0000000000000000000000000000000000000000
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeKernel.swift
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License. */
-//
-//import Foundation
-//import MetalPerformanceShaders
-//
-//
-//struct ResizeParam: OpParam{
-// typealias OutputType = <#type#>
-//
-// typealias ParamPrecisionType = <#type#>
-//
-// let input: MTLTexture
-// let output: MTLTexture
-// let expectDim: Dim
-//}
-//
-//struct OutputDim {
-// let width: UInt16
-// let height: UInt16
-// let strideX: UInt16
-// let strideY: UInt16
-//}
-//
-//class ResizeKernel: Kernel, Computable{
-// var lanczos: MPSImageLanczosScale
-// required init(device: MTLDevice, param: ResizeParam) {
-// lanczos = MPSImageLanczosScale.init(device: device)
-// super.init(device: device, inFunctionName: "resize")
-// }
-// func compute(commandBuffer: MTLCommandBuffer, param: ResizeParam) throws {
-//// guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-//// throw PaddleMobileError.predictError(message: " encode is nil")
-//// }
-// lanczos.encode(commandBuffer: commandBuffer, sourceTexture: param.input, destinationTexture: param.output)
-//
-//// encoder.setTexture(param.input, index: 0)
-//// encoder.setTexture(param.output, index: 1)
-//// let strideX = param.input.width/param.expectDim[2]
-//// let strideY = param.input.height/param.expectDim[1]
-//// var outputDim = OutputDim.init(width: UInt16(param.expectDim[1]), height: UInt16(param.expectDim[2]), strideX: UInt16(strideX), strideY: UInt16(strideY))
-//// encoder.setBytes(&outputDim, length: MemoryLayout.size, index: 0)
-//// encoder.dispatch(computePipline: pipline, outTexture: param.output)
-//// encoder.endEncoding()
-// }
-//
-//
-//
-//
-//}
-
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift
index 6f6d0af477f62d7f438b8b6a38c825c2eb95163f..0c166c3563149d60dc8cbee451ef23e6a1fb9b93 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift
@@ -21,6 +21,17 @@ struct SoftmaxMetalParam {
class SoftmaxKernel: Kernel, Computable{
+ required init(device: MTLDevice, param: SoftmaxParam) {
+ param.output.initTexture(device: device, computePrecision: computePrecision)
+ if computePrecision == .Float32 {
+ super.init(device: device, inFunctionName: "softmax")
+ } else if computePrecision == .Float16 {
+ super.init(device: device, inFunctionName: "softmax_half")
+ } else {
+ fatalError()
+ }
+ }
+
func compute(commandBuffer: MTLCommandBuffer, param: SoftmaxParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
@@ -32,19 +43,12 @@ class SoftmaxKernel: Kernel, Computable{
N: Int32(param.input.tensorDim[0]),
K: Int32(param.input.tensorDim[1])
)
+
+ print(" soft max param: ")
+ print(smp)
encoder.setBytes(&smp, length: MemoryLayout<SoftmaxMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
- required init(device: MTLDevice, param: SoftmaxParam) {
- param.output.initTexture(device: device, computePrecision: computePrecision)
- if computePrecision == .Float32 {
- super.init(device: device, inFunctionName: "softmax")
- } else if computePrecision == .Float16 {
- super.init(device: device, inFunctionName: "softmax_half")
- } else {
- fatalError()
- }
- }
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal
index 3cb71ca21b425a76a6395351531ea7e81edcbbf8..0159dcf3e008ed4fe5bfd083674084fb1fb9e978 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal
@@ -429,7 +429,122 @@ kernel void depthwise_conv_add_3x3_half(texture2d_array in
}
+// 5x1 convolution (five taps along y, one along x) fused with a bias add, for half textures.
+// Every thread computes the texel at (gid.x, gid.y) of output slice gid.z; partial sums are kept
+// in float and only narrowed to half by the final write.
+kernel void conv_add_5x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+    // Threads that fall outside the output texture have nothing to write.
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+    const uint kernelHXW = 5;
+    uint input_arr_size = inTexture.get_array_size();
+
+    // Index of the first weight belonging to this output slice (4 components per slice).
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+    float4 output = float4(0.0);
+
+    ushort dilation_y = param.dilationY;
+    half4 input[5];
+
+    for (uint i = 0; i < input_arr_size; ++i) {
+        // Five taps centred on posInInput, spaced dilation_y apart along y.
+        input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
+        for (int j = 0; j < 5; ++j) {
+            // Widen to float before each dot product so all four components accumulate alike.
+            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(float4(input[j]), float4(weight_x));
+
+            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(float4(input[j]), float4(weight_y));
+
+            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(float4(input[j]), float4(weight_z));
+
+            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(float4(input[j]), float4(weight_w));
+        }
+    }
+    output = output + float4(biase[gid.z]);
+    outTexture.write(half4(output), gid.xy, gid.z);
+}
+
+// 1x5 convolution (five taps along x, one along y) fused with a bias add, for half textures.
+// Every thread computes the texel at (gid.x, gid.y) of output slice gid.z; partial sums are kept
+// in float and only narrowed to half by the final write.
+kernel void conv_add_1x5_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+    // Threads that fall outside the output texture have nothing to write.
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+    const uint kernelHXW = 5;
+    uint input_arr_size = inTexture.get_array_size();
+
+    // Index of the first weight belonging to this output slice (4 components per slice).
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+    float4 output = float4(0.0);
+
+    ushort dilation_x = param.dilationX;
+    half4 input[5];
+
+    for (uint i = 0; i < input_arr_size; ++i) {
+        // Five taps centred on posInInput, spaced dilation_x apart along x.
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
+        for (int j = 0; j < 5; ++j) {
+            // Widen to float before each dot product so the sums are not rounded to half.
+            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(float4(input[j]), float4(weight_x));
+
+            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(float4(input[j]), float4(weight_y));
+
+            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(float4(input[j]), float4(weight_z));
+
+            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(float4(input[j]), float4(weight_w));
+        }
+    }
+    output = output + float4(biase[gid.z]);
+    outTexture.write(half4(output), gid.xy, gid.z);
+}
kernel void test_conv_add_3x3(texture2d_array inTexture [[texture(0)]],
@@ -502,3 +617,6 @@ kernel void test_conv_add_3x3(texture2d_array inTexture [
// output = output + biase[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
+
+
+
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
index 35c2c56cace5975c5fe166b007bb695eba163325..c07515c13da54c7f8bf698f976e47f7cda6de32b 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
@@ -148,4 +148,133 @@ kernel void conv_1x1(texture2d_array inTexture [[texture(
}
+kernel void conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+ texture2d_array<half, access::write> outTexture [[texture(1)]],
+ constant MetalConvParam &param [[buffer(0)]],
+ const device half4 *weights [[buffer(1)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 9;
+ uint input_arr_size = inTexture.get_array_size();
+ uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+ float4 output = float4(0.0);
+
+ half4 input[9];
+ for (uint i = 0; i < input_arr_size; ++i) {
+ input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
+ input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
+ input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
+ input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
+ input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+ input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
+ input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
+ input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
+ input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
+ for (int j = 0; j < 9; ++j) {
+ half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.x += dot(float4(input[j]), float4(weight_x));
+
+ half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.y += dot(float4(input[j]), float4(weight_y));
+
+ half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.z += dot(float4(input[j]), float4(weight_z));
+
+ half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+ output.w += dot(float4(input[j]), float4(weight_w));
+ }
+ }
+ outTexture.write(half4(output), gid.xy, gid.z);
+}
+
+kernel void depthwise_conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+ texture2d_array<half, access::write> outTexture [[texture(1)]],
+ constant MetalConvParam &param [[buffer(0)]],
+ const device half *weights [[buffer(1)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+ uint output_slice = gid.z;
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 9;
+ uint weithTo = gid.z * kernelHXW * 4;
+ float4 output = float4(0.0);
+ half4 inputs[9];
+ inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
+ inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
+ inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
+ inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
+ inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
+ inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
+ inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
+ inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
+ inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
+ for (int j = 0; j < 9; ++j) {
+ half4 input = inputs[j];
+ output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]);
+ output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]);
+ output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
+ output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
+ }
+ outTexture.write(half4(output), gid.xy, gid.z);
+}
+
+kernel void conv_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+ texture2d_array<half, access::write> outTexture [[texture(1)]],
+ constant MetalConvParam &param [[buffer(0)]],
+ const device half4 *weights [[buffer(1)]],
+ uint3 gid [[thread_position_in_grid]]) {
+
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
+
+ ushort2 stride = ushort2(param.strideX, param.strideY);
+ ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ const uint kernelHXW = 1;
+
+ uint input_arr_size = inTexture.get_array_size();
+ uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+ float4 output = float4(0.0);
+
+ half4 input;
+ for (uint i = 0; i < input_arr_size; ++i) {
+ input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+ half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
+ output.x += dot(float4(input), float4(weight_x));
+
+ half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
+ output.y += dot(float4(input), float4(weight_y));
+
+ half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
+ output.z += dot(float4(input), float4(weight_z));
+
+ half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+ output.w += dot(float4(input), float4(weight_w));
+ }
+ outTexture.write(half4(output), gid.xy, gid.z);
+}
+
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal
index 5c5a499fceae9061dba30307e2c4ecd3b0e89164..baf3f31157a472412bb08ccb3c803f5ec9e25d9c 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal
@@ -29,11 +29,11 @@ struct MetalConvTransposeParam{
ushort dilationY;
};
-kernel void conv_transpose(texture2d_array<float, access::sample> inTexture [[texture(0)]],
- texture2d_array<float, access::write> outTexture [[texture(1)]],
- constant MetalConvTransposeParam &param [[buffer(0)]],
- const device float4 *weights [[buffer(1)]],
- uint3 gid [[thread_position_in_grid]]){
+kernel void conv_transpose2x2_stride2(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+ texture2d_array<float, access::write> outTexture [[texture(1)]],
+ constant MetalConvTransposeParam &param [[buffer(0)]],
+ const device float4 *weights [[buffer(1)]],
+ uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
@@ -41,48 +41,134 @@ kernel void conv_transpose(texture2d_array inTexture [[te
}
int input_array_size = inTexture.get_array_size();
-
- uint kernel_one_output_slice = input_array_size * param.kernelW * param.kernelH;
-
- uint kernel_stride_z = gid.z * 4 * (kernel_one_output_slice);
+ int kernel_index_x = gid.x % 2;
+ int kernel_index_y = gid.y % 2;
+ int kernel_index = kernel_index_y * 2 + kernel_index_x;
+ int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size);
+ int input_x = gid.x / 2;
+ int input_y = gid.y / 2;
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ float4 output = float4(0.0);
+ for (int i = 0; i < input_array_size; ++i) {
+
+ float4 input = inTexture.sample(sample, float2(input_x, input_y), i);
+
+ float4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i];
+ float4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i];
+ float4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i];
+ float4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i];
+
+ output.x += dot(input, kernel_slice0);
+
+ output.y += dot(input, kernel_slice1);
+
+ output.z += dot(input, kernel_slice2);
+
+ output.w += dot(input, kernel_slice3);
+ }
+
+ outTexture.write(output, gid.xy, gid.z);
+}
+
+// Half-precision transposed convolution specialised for a 2x2 kernel with stride 2.
+// Each thread produces one output texel (gid.xy) of output slice gid.z: it samples the
+// input texel at (gid.x / 2, gid.y / 2) in every input slice and accumulates four dot
+// products (one per output component) in float before writing the result as half4.
+// NOTE(review): the added (+) lines never read `param`; kernel size and stride are
+// hard-coded - presumably the host only dispatches this kernel for 2x2 / stride 2. Confirm.
+kernel void conv_transpose2x2_stride2_half(texture2d_array inTexture [[texture(0)]],
+ texture2d_array outTexture [[texture(1)]],
+ constant MetalConvTransposeParam ¶m [[buffer(0)]],
+ const device half4 *weights [[buffer(1)]],
+ uint3 gid [[thread_position_in_grid]]) {
+ // Threads dispatched outside the output texture do nothing.
+ if (gid.x >= outTexture.get_width() ||
+ gid.y >= outTexture.get_height() ||
+ gid.z >= outTexture.get_array_size()) {
+ return;
+ }
- float4 output;
+ int input_array_size = inTexture.get_array_size();
+ // The parity of the output coordinate selects which of the four kernel positions applies.
+ int kernel_index_x = gid.x % 2;
+ int kernel_index_y = gid.y % 2;
+ int kernel_index = kernel_index_y * 2 + kernel_index_x;
+ // Weight offset: 4 * 4 * input_array_size half4 entries per output slice, plus
+ // input_array_size entries per kernel position; the per-component stride
+ // (input_array_size * 4) is added inside the loop below.
+ int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size);
+ int input_x = gid.x / 2;
+ int input_y = gid.y / 2;
- for (int w = 0; w < param.kernelW; ++w) {
- int input_x = (gid.x - w * param.dilationX + param.paddingX) / param.strideX;
- if (input_x < 0 || input_x >= int(inTexture.get_width())) {
- continue;
- }
+ constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+ // Accumulate in float; the sum is narrowed to half4 only when it is written out.
+ float4 output = float4(0.0);
+ for (int i = 0; i < input_array_size; ++i) {
- for (int h = 0; h < param.kernelH; ++h) {
- int input_y = (gid.y - h * param.dilationY + param.paddingY) / param.strideY;
- if (input_y < 0 || input_y >= int(inTexture.get_height())) {
- continue;
- }
-
- uint kernel_index = (w * param.kernelH + h) * inTexture.get_array_size();
-
- for (int slice = 0; slice < input_array_size; ++slice) {
-
- float4 input;
- float4 kernel_slice = weights[kernel_stride_z + 0 * kernel_one_output_slice + kernel_index + slice];
- float4 kernel_slice1 = weights[kernel_stride_z + 1 * kernel_one_output_slice + kernel_index + slice];
-
- float4 kernel_slice2 = weights[kernel_stride_z + 2 * kernel_one_output_slice + kernel_index + slice];
-
- float4 kernel_slice3 = weights[kernel_stride_z + 3 * kernel_one_output_slice + kernel_index + slice];
-
- input = inTexture.sample(sample, float2(input_x, input_x), slice);
- output.x += dot(input, kernel_slice);
- output.x += dot(input, kernel_slice1);
- output.x += dot(input, kernel_slice2);
- output.x += dot(input, kernel_slice3);
- }
- }
+ half4 input = inTexture.sample(sample, float2(input_x, input_y), i);
+
+ // One weight vector per output component (x, y, z, w) for input slice i.
+ half4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i];
+ half4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i];
+ half4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i];
+ half4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i];
+
+ output.x += dot(float4(input), float4(kernel_slice0));
+
+ output.y += dot(float4(input), float4(kernel_slice1));
+
+ output.z += dot(float4(input), float4(kernel_slice2));
+
+ output.w += dot(float4(input), float4(kernel_slice3));
}
- outTexture.write(output, gid.xy, gid.z);
+ outTexture.write(half4(output), gid.xy, gid.z);
}
+//kernel void conv_transpose(texture2d_array inTexture [[texture(0)]],
+// texture2d_array outTexture [[texture(1)]],
+// constant MetalConvTransposeParam ¶m [[buffer(0)]],
+// const device float4 *weights [[buffer(1)]],
+// uint3 gid [[thread_position_in_grid]]){
+// if (gid.x >= outTexture.get_width() ||
+// gid.y >= outTexture.get_height() ||
+// gid.z >= outTexture.get_array_size()) {
+// return;
+// }
+//
+// int input_array_size = inTexture.get_array_size();
+//
+// uint kernel_one_output_slice = input_array_size * param.kernelW * param.kernelH;
+//
+// uint kernel_stride_z = gid.z * 4 * (kernel_one_output_slice);
+//
+// constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+//
+// float4 output;
+//
+// for (int w = 0; w < param.kernelW; ++w) {
+// int top = gid.x - w * param.dilationX + param.paddingX;
+// int input_x = top / param.strideX;
+// if (top < 0 || input_x >= int(inTexture.get_width())) {
+// continue;
+// }
+//
+// for (int h = 0; h < param.kernelH; ++h) {
+// int top_y = gid.y - h * param.dilationY + param.paddingY;
+// int input_y = top_y / param.strideY;
+// if (top_y < 0 || input_y >= int(inTexture.get_height())) {
+// continue;
+// }
+//
+// uint kernel_index = (w * param.kernelH + h) * inTexture.get_array_size();
+//
+// for (int slice = 0; slice < input_array_size; ++slice) {
+//
+// float4 input;
+// float4 kernel_slice = weights[kernel_stride_z + 0 * kernel_one_output_slice + kernel_index + slice];
+// float4 kernel_slice1 = weights[kernel_stride_z + 1 * kernel_one_output_slice + kernel_index + slice];
+//
+// float4 kernel_slice2 = weights[kernel_stride_z + 2 * kernel_one_output_slice + kernel_index + slice];
+//
+// float4 kernel_slice3 = weights[kernel_stride_z + 3 * kernel_one_output_slice + kernel_index + slice];
+//
+// input = inTexture.sample(sample, float2(input_x, input_y), slice);
+// output.x += dot(input, kernel_slice);
+// output.y += dot(input, kernel_slice1);
+// output.z += dot(input, kernel_slice2);
+// output.w += dot(input, kernel_slice3);
+// }
+// }
+// }
+//
+// outTexture.write(output, gid.xy, gid.z);
+//}
+//
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal
index 5ad53a8ef94adde2de6034727b49415ba2cd17c7..cc662786de91a0c7d62aca4b0124ee68990d797e 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal
@@ -18,6 +18,7 @@
using namespace metal;
struct ElementwiseAddParam {
+ int32_t unsafe_one_dim;
int32_t fast;
int32_t axis;
int32_t yoff;
@@ -36,7 +37,10 @@ kernel void elementwise_add(texture2d_array inputX [[textur
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
float4 rx, ry;
- if (pm.fast == 1) {
+ if (pm.unsafe_one_dim == 1) {
+ rx = inputX.read(gid.xy, gid.z);
+ ry = inputY.read(uint2(0, 0), gid.z);
+ } else if (pm.fast == 1) {
rx = inputX.read(gid.xy, gid.z);
ry = inputY.read(gid.xy, gid.z);
} else {
@@ -59,3 +63,39 @@ kernel void elementwise_add(texture2d_array inputX [[textur
float4 r = rx + ry;
outTexture.write(r, gid.xy, gid.z);
}
+
+// Half-precision element-wise add: writes inputX + inputY to outTexture at gid.
+//
+// How the Y operand is fetched depends on `pm`:
+//  - unsafe_one_dim == 1: texel (0, 0) of slice gid.z of inputY is used for every (x, y).
+//  - fast == 1:           both inputs are read at the same texel coordinate.
+//  - otherwise:           each of the four components of the X texel is mapped
+//                         separately through the xyzn2abcd / invtrans / trans /
+//                         abcd2xyzn helpers to locate the matching Y element.
+kernel void elementwise_add_half(texture2d_array inputX [[texture(0)]],
+                                 texture2d_array inputY [[texture(1)]],
+                                 texture2d_array outTexture [[texture(2)]],
+                                 constant ElementwiseAddParam &pm [[buffer(0)]],
+                                 uint3 gid [[thread_position_in_grid]]) {
+  // Threads dispatched outside the output texture do nothing.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+  half4 rx, ry;
+  if (pm.unsafe_one_dim == 1) {
+    rx = inputX.read(gid.xy, gid.z);
+    ry = inputY.read(uint2(0, 0), gid.z);
+  } else if (pm.fast == 1) {
+    rx = inputX.read(gid.xy, gid.z);
+    ry = inputY.read(gid.xy, gid.z);
+  } else {
+    rx = inputX.read(gid.xy, gid.z);
+    int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
+    int32_t y_abcd[4] = {1, 1, 1, 1}, y_xyzn[4];
+    int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
+    int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
+    for (int n = 0; n < 4; n++) {
+      // Resolve component n of the X texel. Without this assignment every pass
+      // would map component 0 and ry would hold the same value in all four lanes.
+      x_xyzn[3] = n;
+      xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
+      invtrans(xtrans, x_abcd, t_abcd);
+      for (int k = pm.axis; k < (4 - pm.yoff); k++) {
+        y_abcd[k+pm.yoff] = t_abcd[k];
+      }
+      trans(ytrans, y_abcd, t_abcd);
+      abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
+      ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
+    }
+  }
+  half4 r = rx + ry;
+  outTexture.write(r, gid.xy, gid.z);
+}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal
index bd14a146100d0a0723f73ab5fd1f95d1f8e39c97..597804137743dd253d05d91a5008f558dcaf42e7 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal
@@ -81,3 +81,71 @@ kernel void prelu_other(texture2d_array inTexture [[textu
outTexture.write(output, gid.xy, gid.z);
}
+
+// Half-precision PReLU with one alpha vector per array slice: positive components
+// pass through unchanged, all others are multiplied by the matching component of
+// alpha[gid.z].
+kernel void prelu_channel_half(texture2d_array inTexture [[texture(0)]],
+                               texture2d_array outTexture [[texture(1)]],
+                               const device half4 *alpha [[buffer(0)]],
+                               uint3 gid [[thread_position_in_grid]]){
+  const bool outside = gid.x >= outTexture.get_width() ||
+                       gid.y >= outTexture.get_height() ||
+                       gid.z >= outTexture.get_array_size();
+  if (outside) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const half4 value = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+  const half4 slope = alpha[gid.z];
+
+  // select(a, b, c) yields b in the lanes where c is true and a in the others.
+  const half4 result = select(slope * value, value, value > half4(0.0h));
+  outTexture.write(result, gid.xy, gid.z);
+}
+
+// Half-precision PReLU with an individual alpha vector for every texel and slice.
+// The alpha index is (gid.y * input width + gid.x) * input array size + gid.z.
+kernel void prelu_element_half(texture2d_array inTexture [[texture(0)]],
+                               texture2d_array outTexture [[texture(1)]],
+                               const device half4 *alpha [[buffer(0)]],
+                               uint3 gid [[thread_position_in_grid]]){
+  const bool outside = gid.x >= outTexture.get_width() ||
+                       gid.y >= outTexture.get_height() ||
+                       gid.z >= outTexture.get_array_size();
+  if (outside) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const half4 value = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+
+  // First alpha entry belonging to this (x, y) position; gid.z picks the slice.
+  const int texel_base = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size();
+  const half4 slope = alpha[texel_base + gid.z];
+
+  half4 result;
+  for (ushort c = 0; c < 4; ++c) {
+    result[c] = value[c] > 0 ? value[c] : (slope[c] * value[c]);
+  }
+  outTexture.write(result, gid.xy, gid.z);
+}
+
+// Half-precision PReLU with a single shared slope: every non-positive component is
+// multiplied by alpha[0], positive components pass through unchanged.
+kernel void prelu_other_half(texture2d_array inTexture [[texture(0)]],
+                             texture2d_array outTexture [[texture(1)]],
+                             const device half *alpha [[buffer(0)]],
+                             uint3 gid [[thread_position_in_grid]]){
+  const bool outside = gid.x >= outTexture.get_width() ||
+                       gid.y >= outTexture.get_height() ||
+                       gid.z >= outTexture.get_array_size();
+  if (outside) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const half4 value = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+  const half slope = alpha[0];
+
+  // select(a, b, c) yields b in the lanes where c is true and a in the others.
+  const half4 result = select(slope * value, value, value > half4(0.0h));
+  outTexture.write(result, gid.xy, gid.z);
+}
+
+
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift
index d3d31cfcd43a81f6068238d45f3442af0fc7795f..6f42f2aa9f8d0515946ace625ed16c5040fd3099 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift
@@ -60,7 +60,7 @@ class PoolOp: Operator, PoolParam>, Runable,
func delogOutput() {
print(" \(type) output: ")
- print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray())
+ print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
// print("pool2d delog")
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift
index c7e049e3c1b21d9747acca8812abfff8c25d6d98..52cab3425f91e9fdfa2f5932d32eb7cc3947f6ab 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift
@@ -51,13 +51,13 @@ class PreluOp: Operator, PreluParam>, Runabl
+ /// Debug dump: prints the input texture, the alpha buffer and the output texture.
+ /// Both textures are converted with `toTensor` using their `padToFourDim` as (n, c, h, w).
func delogOutput() {
print(" \(type) input: ")
- print(para.input.metalTexture.toTensor(dim: (n: para.input.originDim[0], c: para.input.originDim[1], h: para.input.originDim[2], w: para.input.originDim[3]), texturePrecision: computePrecision).strideArray())
+ print(para.input.metalTexture.toTensor(dim: (n: para.input.padToFourDim[0], c: para.input.padToFourDim[1], h: para.input.padToFourDim[2], w: para.input.padToFourDim[3])).strideArray())
print(" \(type) Alpha: ")
let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false)
print(" \(type) output: ")
- print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray())
+ print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
}
// print("softmax delog")
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
index d48fc4cd1cc4bfef9d752ea999cb39b61cb02941..4a27fba9839af5feb709a69a76529c60928a981e 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
@@ -76,12 +76,12 @@ class PriorBoxOp: Operator, PriorBoxParam
print(outputArray)
// output
// print(" \(type) output: ")
-// let originDim = para.output.originDim
+// let padToFourDim = para.output.padToFourDim
// if para.output.transpose == [0, 1, 2, 3] {
-// let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
+// let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]), texturePrecision: computePrecision)
// print(outputArray.strideArray())
// } else if para.output.transpose == [0, 2, 3, 1] {
-// print(para.output.metalTexture.toTensor(dim: (n: originDim[0], c: originDim[1], h: originDim[2], w: originDim[3]), texturePrecision: computePrecision).strideArray())
+// print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3]), texturePrecision: computePrecision).strideArray())
// } else {
// print(" not implement")
// }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift
index 0325f860e078cf639c08e279970a105e3f562a32..c9f054c88af44ac3f5dd453b4696c7988d01fa8f 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift
@@ -46,7 +46,7 @@ class ReluOp: Operator, ReluParam>, Runable,
+ /// Debug dump: prints the output texture converted with `toTensor`, using the
+ /// output's `tensorDim` as (n, c, h, w).
func delogOutput() {
print(" \(type) output: ")
- print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray())
+ print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
}
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift
index 451b064ce19e0e1cb70700d046b6ab059e6df9e3..b37026d6a81c6eed4493953a84afc05d4a6b980f 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift
@@ -41,8 +41,8 @@ class ReshapeParam: OpParam {
for i in 0..: Operator, ReshapeParam>,
print("reshape delog")
// let _: P? = para.input.metalTexture.logDesc(header: "reshape input: ", stridable: false)
- let originDim = para.output.originDim
+ let padToFourDim = para.output.padToFourDim
- let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
+ let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(outputArray.strideArray())
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift
index af776450d7f15d031a6af546d13bc1c0374249b7..66b5c7b3146d4c433e12b846a971e4b5ae579f79 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift
@@ -26,7 +26,7 @@ class SoftmaxParam: OpParam {
output.dim = input.dim
output.tensorDim = input.tensorDim
- output.originDim = input.originDim
+ output.padToFourDim = input.padToFourDim
} catch let error {
throw error
}
@@ -52,9 +52,11 @@ class SoftmaxOp: Operator, SoftmaxParam>,
+ /// Debug dump: prints the input and output parameters, then the output texture
+ /// read back with `realNHWC`, using the output's `padToFourDim` as (n, h, w, c).
func delogOutput() {
print("softmax delog")
-
- let originDim = para.output.originDim
- let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
+ print(para.input)
+
+ print(para.output)
+ let padToFourDim = para.output.padToFourDim
+ let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(outputArray.strideArray())
}
}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift
index 8281ba543365410f49eaad1f4d992f54af30531d..0213b52bf16fa498835729c5b7e3a65600f7669d 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift
@@ -48,9 +48,9 @@ class TransposeOp: Operator, TransposeParam
func delogOutput() {
print(" \(type) output: ")
- let originDim = para.output.originDim
+ let padToFourDim = para.output.padToFourDim
if para.output.transpose == [0, 1, 2, 3] {
- let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
+ let outputArray = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] {
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
diff --git a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
index a318180b2b57b162715f0088fdfd88767506ad2a..c5ee1414521e7eb92011d4f4b608ad326b005531 100644
--- a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
@@ -95,7 +95,28 @@ class Tensor: Tensorial {
- func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16) {
+ func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16, convertToNHWC: Bool = true, withTranspose: Bool = false) {
+ if convertToNHWC {
+// print(layout)
+ convert(to: DataLayout.NHWC())
+ }
+
+ if withTranspose {
+ let transposePointer = UnsafeMutablePointer.allocate(capacity: numel())
+ let n = dim[0]
+ let hwc = numel()/n
+ for j in 0.. else {
fatalError(" not support yet ")
}
@@ -139,6 +160,8 @@ class Tensor: Tensorial {
for j in 0..: Tensorial {
float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
}
+ convertedPointer.deinitialize(count: count)
+ convertedPointer.deallocate()
+ }
+ } else {
+ let C = dim[3]
+ let cSlices = (C + 3) / 4
+ let paddedC = cSlices * 4
+ let count = paddedC * dim[0] * dim[1] * dim[2]
+ if C == paddedC {
+ buffer = device.makeBuffer(length: count * precisionSize)
+ switch precision {
+ case .Float32:
+ buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout.stride)
+ case .Float16:
+ float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
+ }
+ } else if C == 1 {
+ fatalError(" not support ")
+ } else {
+ buffer = device.makeBuffer(length: count * precisionSize)
+ let convertedPointer = UnsafeMutablePointer.allocate(capacity: count)
+ var tmpPointer = floatPointer
+ var dstPtr = convertedPointer
+ for _ in 0...stride)
+ case .Float16:
+ float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
+ }
convertedPointer.deinitialize(count: count)
convertedPointer.deallocate()
}
diff --git a/metal/paddle-mobile/paddle-mobile/framework/Texture.swift b/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
index 42a381c50cd3a66f39d9e19e9cffc8de83ca48b2..d3beb92a5fa30dfae5ddbbcd8bb7563b791771e4 100644
--- a/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
@@ -41,14 +41,28 @@ extension InputTexture {
public class Texture: Tensorial {
var dim: Dim
public var tensorDim: Dim
- public var originDim: Dim
+ public var padToFourDim: Dim
private var textureDesc: MTLTextureDescriptor!
public var metalTexture: MTLTexture!
var transpose: [Int] = [0, 1, 2, 3]
+ /// Returns `metalTexture.toTensor(dim:)` with `padToFourDim` supplied as (n, c, h, w).
+ /// Traps when `padToFourDim` does not have exactly four dimensions.
+ func toTensor() -> [Float32] {
+     let shape = padToFourDim
+     if shape.cout() != 4 {
+         fatalError("- not support -")
+     }
+     let nchw = (n: shape[0], c: shape[1], h: shape[2], w: shape[3])
+     return metalTexture.toTensor(dim: nchw)
+ }
+
+ /// Returns `metalTexture.realNHWC(dim:)` with `padToFourDim` supplied as (n, h, w, c).
+ /// Traps when `padToFourDim` does not have exactly four dimensions.
+ func realNHWC() -> [Float32] {
+     let shape = padToFourDim
+     if shape.cout() != 4 {
+         fatalError(" - not support - ")
+     }
+     let nhwc = (n: shape[0], h: shape[1], w: shape[2], c: shape[3])
+     return metalTexture.realNHWC(dim: nhwc)
+ }
+
func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) {
transpose = inTranspose
- let newDim = transpose.map { originDim[$0] }
+ let newDim = transpose.map { padToFourDim[$0] }
let newLayout = transpose.map { layout.layoutWithDim[$0] }
@@ -93,7 +107,7 @@ public class Texture: Tensorial {
}
tensorDim = inDim
dim = fourDim
- originDim = fourDim
+ padToFourDim = fourDim
layout = DataLayout.init([(.N, fourDim[0]), (.C, fourDim[1]), (.H, fourDim[2]), (.W, fourDim[3])])
}