Commit 45c202e5 authored by D dolphin8, committed by GitHub

Merge pull request #970 from dolphin8/metal

Metal
......@@ -16,9 +16,13 @@
4AA1EA92214665D700D0F791 /* ShapeOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA91214665D700D0F791 /* ShapeOp.swift */; };
4AA1EA942146661500D0F791 /* ShapeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA932146661500D0F791 /* ShapeKernel.swift */; };
4AA1EA982146666500D0F791 /* FlattenOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA972146666500D0F791 /* FlattenOp.swift */; };
4AA1EA9E2148D6F900D0F791 /* ConcatKernel.inc.metal in Headers */ = {isa = PBXBuildFile; fileRef = 4AA1EA9D2148D6F900D0F791 /* ConcatKernel.inc.metal */; };
4AA1EAA02148DEEE00D0F791 /* ReshapeKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA9F2148DEEE00D0F791 /* ReshapeKernel.inc.metal */; };
4AA1EAA2214912CD00D0F791 /* FlattenKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA1214912CC00D0F791 /* FlattenKernel.swift */; };
4AA1EAA4214A295C00D0F791 /* Split.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA3214A295C00D0F791 /* Split.inc.metal */; };
4AF928772133F1DB005B6C3A /* BoxCoder.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF928762133F1DB005B6C3A /* BoxCoder.metal */; };
4AF9287921341661005B6C3A /* Softmax.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF9287821341661005B6C3A /* Softmax.metal */; };
4AF928822135673D005B6C3A /* Concat.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF928812135673D005B6C3A /* Concat.metal */; };
4AF928822135673D005B6C3A /* ConcatKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF928812135673D005B6C3A /* ConcatKernel.metal */; };
4AF9288421357BE3005B6C3A /* Elementwise.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF9288321357BE3005B6C3A /* Elementwise.metal */; };
D3831F70E7E0B565B9AC22DA /* Pods_paddle_mobile.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = DD2E06330A1E7129C918DB46 /* Pods_paddle_mobile.framework */; };
FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC0226552138F33800F395E2 /* TransposeKernel.metal */; };
......@@ -124,9 +128,13 @@
4AA1EA91214665D700D0F791 /* ShapeOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ShapeOp.swift; sourceTree = "<group>"; };
4AA1EA932146661500D0F791 /* ShapeKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ShapeKernel.swift; sourceTree = "<group>"; };
4AA1EA972146666500D0F791 /* FlattenOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FlattenOp.swift; sourceTree = "<group>"; };
4AA1EA9D2148D6F900D0F791 /* ConcatKernel.inc.metal */ = {isa = PBXFileReference; explicitFileType = sourcecode.metal; fileEncoding = 4; path = ConcatKernel.inc.metal; sourceTree = "<group>"; };
4AA1EA9F2148DEEE00D0F791 /* ReshapeKernel.inc.metal */ = {isa = PBXFileReference; explicitFileType = sourcecode.metal; fileEncoding = 4; path = ReshapeKernel.inc.metal; sourceTree = "<group>"; };
4AA1EAA1214912CC00D0F791 /* FlattenKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FlattenKernel.swift; sourceTree = "<group>"; };
4AA1EAA3214A295C00D0F791 /* Split.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Split.inc.metal; sourceTree = "<group>"; };
4AF928762133F1DB005B6C3A /* BoxCoder.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BoxCoder.metal; sourceTree = "<group>"; };
4AF9287821341661005B6C3A /* Softmax.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Softmax.metal; sourceTree = "<group>"; };
4AF928812135673D005B6C3A /* Concat.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Concat.metal; sourceTree = "<group>"; };
4AF928812135673D005B6C3A /* ConcatKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConcatKernel.metal; sourceTree = "<group>"; };
4AF9288321357BE3005B6C3A /* Elementwise.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Elementwise.metal; sourceTree = "<group>"; };
CDF58151D902A1CBAE56A0C2 /* Pods-paddle-mobile.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile.debug.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile/Pods-paddle-mobile.debug.xcconfig"; sourceTree = "<group>"; };
DD2E06330A1E7129C918DB46 /* Pods_paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; };
......@@ -391,6 +399,7 @@
FCD04E6720F315020007374F /* PoolKernel.swift */,
FCD04E6B20F31A280007374F /* SoftmaxKernel.swift */,
FCD04E6F20F31B720007374F /* ReshapeKernel.swift */,
4AA1EAA1214912CC00D0F791 /* FlattenKernel.swift */,
FCD04E7320F3437E0007374F /* ConvAddKernel.swift */,
FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */,
FCBCCC602122FBDF00D94F7E /* PriorBoxKernel.swift */,
......@@ -437,12 +446,14 @@
isa = PBXGroup;
children = (
FC27990D21341016000B6BAD /* BoxCoder.metal */,
4AF928812135673D005B6C3A /* Concat.metal */,
4AF928812135673D005B6C3A /* ConcatKernel.metal */,
4AA1EA9D2148D6F900D0F791 /* ConcatKernel.inc.metal */,
4AF9288321357BE3005B6C3A /* Elementwise.metal */,
FC1B16B220EC9A4F00678B91 /* Kernels.metal */,
FC4CB74820F0B954007C0C6D /* ConvKernel.metal */,
4AF928762133F1DB005B6C3A /* BoxCoder.metal */,
4AA1EA8F214664CD00D0F791 /* Split.metal */,
4AA1EAA3214A295C00D0F791 /* Split.inc.metal */,
4AA1EA892146631C00D0F791 /* BilinearInterp.metal */,
4AF9287821341661005B6C3A /* Softmax.metal */,
FCEB6849212F00DB00D2448E /* PreluKernel.metal */,
......@@ -450,6 +461,7 @@
FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */,
FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */,
FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */,
4AA1EA9F2148DEEE00D0F791 /* ReshapeKernel.inc.metal */,
FCA3A1642132A5EB00084FE5 /* Common.metal */,
FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */,
FCA67CD42138272900BD58AA /* ConvAddMetal.metal */,
......@@ -471,6 +483,7 @@
FC4FD9792140E4980073E130 /* PaddleMobile.h in Headers */,
FC292C85214257CB00CF622F /* CPUCompute.h in Headers */,
FC292C5421421B2F00CF622F /* PaddleMobileGPU.h in Headers */,
4AA1EA9E2148D6F900D0F791 /* ConcatKernel.inc.metal in Headers */,
FC039B6F20E11C3C0081E9F8 /* paddle_mobile.h in Headers */,
);
runOnlyForDeploymentPostprocessing = 0;
......@@ -574,6 +587,7 @@
FC039BAA20E11CBC0081E9F8 /* ElementwiseAddOp.swift in Sources */,
FCDE8A33212A917900F4A8F6 /* ConvTransposeOp.swift in Sources */,
FCBCCC6B2123071700D94F7E /* BoxcoderOp.swift in Sources */,
4AA1EAA4214A295C00D0F791 /* Split.inc.metal in Sources */,
FC039B9B20E11CA00081E9F8 /* Executor.swift in Sources */,
4AF9288421357BE3005B6C3A /* Elementwise.metal in Sources */,
FCD04E7020F31B720007374F /* ReshapeKernel.swift in Sources */,
......@@ -610,6 +624,7 @@
FCBCCC592122F42700D94F7E /* ConvBNReluOp.swift in Sources */,
FC039BA920E11CBC0081E9F8 /* ConvOp.swift in Sources */,
FC9D038420E23B01000F735A /* Texture.swift in Sources */,
4AA1EAA2214912CD00D0F791 /* FlattenKernel.swift in Sources */,
4AA1EA982146666500D0F791 /* FlattenOp.swift in Sources */,
FCBCCC652122FCD700D94F7E /* TransposeOp.swift in Sources */,
FCD04E6E20F31B4B0007374F /* ReshapeOp.swift in Sources */,
......@@ -624,7 +639,7 @@
FCBCCC6F2123097100D94F7E /* MulticlassNMSOp.swift in Sources */,
FC039BBC20E11CC20081E9F8 /* VarDesc.swift in Sources */,
FC292C872142624800CF622F /* Genet.swift in Sources */,
4AF928822135673D005B6C3A /* Concat.metal in Sources */,
4AF928822135673D005B6C3A /* ConcatKernel.metal in Sources */,
FCBCCC632122FCC000D94F7E /* TransposeKernel.swift in Sources */,
FCBCCC71212309A700D94F7E /* MulticlassNMSKernel.swift in Sources */,
FCDC0FEB21099A1D00DC9EFB /* Tools.swift in Sources */,
......@@ -650,6 +665,7 @@
FCBCCC67212306B000D94F7E /* ConcatOp.swift in Sources */,
FCD04E6C20F31A280007374F /* SoftmaxKernel.swift in Sources */,
FCEB684A212F00DB00D2448E /* PreluKernel.metal in Sources */,
4AA1EAA02148DEEE00D0F791 /* ReshapeKernel.inc.metal in Sources */,
FC9A19E32148C31300CD9CBF /* MobilenetSSD_AR.swift in Sources */,
FCDDC6CF212FE14700E5EF74 /* PriorBoxKernel.metal in Sources */,
FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */,
......
......@@ -83,38 +83,38 @@ public class PaddleMobileUnitTest {
}
public func testConcat() {
let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
var it: [[Float32]] = []
for _ in 0..<7 {
it.append((0..<12).map { Float32($0) })
}
let input = it.map { device.tensor2texture(value: $0, dim: [3, 4]) }
let output = device.tensor2texture(value: [Float32](), dim: [3, 28])
let param = ConcatTestParam.init(
input: input,
output: output,
dims: [[3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]],
axis: 1,
odim: [3, 28]
)
let concatKernel = ConcatKernel<Float32>.init(device: device, testParam: param)
concatKernel.test(cmdBuffer: buffer, param: param)
buffer.addCompletedHandler { (buffer) in
for i in 0..<it.count {
let _: Float32? = input[i].logDesc()
self.tensorPrint(tensor: it[i], dim: [3, 4])
}
let _: Float32? = output.logDesc()
let tx: [Float32] = self.device.texture2tensor(texture: output, dim: [3, 28])
self.tensorPrint(tensor: tx, dim: [3, 28])
}
buffer.commit()
// let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
// var it: [[Float32]] = []
// for _ in 0..<7 {
// it.append((0..<12).map { Float32($0) })
// }
// let input = it.map { device.tensor2texture(value: $0, dim: [3, 4]) }
// let output = device.tensor2texture(value: [Float32](), dim: [3, 28])
//
// let param = ConcatTestParam.init(
// input: input,
// output: output,
// dims: [[3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]],
// axis: 1,
// odim: [3, 28]
// )
// let concatKernel = ConcatKernel<Float32>.init(device: device, testParam: param)
// concatKernel.test(cmdBuffer: buffer, param: param)
// buffer.addCompletedHandler { (buffer) in
// for i in 0..<it.count {
// let _: Float32? = input[i].logDesc()
// self.tensorPrint(tensor: it[i], dim: [3, 4])
// }
// let _: Float32? = output.logDesc()
// let tx: [Float32] = self.device.texture2tensor(texture: output, dim: [3, 28])
// self.tensorPrint(tensor: tx, dim: [3, 28])
// }
//
// buffer.commit()
}
public func testReshape() {
let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
// let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
// let input: [Float32] = (0..<24).map { Float32($0) }
// let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
// let outTexture = device.tensor2texture(value: [Float32](), dim: [4, 6])
......@@ -139,32 +139,32 @@ public class PaddleMobileUnitTest {
// self.tensorPrint(tensor: tx, dim: [4, 6])
// }
let input: [Float32] = (0..<24).map { Float32($0) }
let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
let outTexture = device.tensor2texture(value: [Float32](), dim: [24])
let mp = ReshapeMetalParam.init(
idim: (1, 2, 3, 4),
itrans: (0, 1, 2, 3),
odim: (1, 1, 1, 24),
otrans: (0, 1, 2, 3)
)
let param = ReshapeTestParam.init(
inputTexture: inTexture,
outputTexture: outTexture,
param: mp
)
let reshapeKernel = ReshapeKernel<Float32>.init(device: device, testParam: param)
reshapeKernel.test(commandBuffer: buffer, testParam: param)
buffer.addCompletedHandler { (buffer) in
let _: Float32? = inTexture.logDesc()
let _: Float32? = outTexture.logDesc()
self.tensorPrint(tensor: input, dim: [2, 3, 4])
let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [24])
self.tensorPrint(tensor: tx, dim: [24])
}
buffer.commit()
// let input: [Float32] = (0..<24).map { Float32($0) }
// let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
// let outTexture = device.tensor2texture(value: [Float32](), dim: [24])
// let mp = ReshapeMetalParam.init(
// idim: (1, 2, 3, 4),
// itrans: (0, 1, 2, 3),
// odim: (1, 1, 1, 24),
// otrans: (0, 1, 2, 3)
// )
// let param = ReshapeTestParam.init(
// inputTexture: inTexture,
// outputTexture: outTexture,
// param: mp
// )
// let reshapeKernel = ReshapeKernel<Float32>.init(device: device, testParam: param)
// reshapeKernel.test(commandBuffer: buffer, testParam: param)
// buffer.addCompletedHandler { (buffer) in
// let _: Float32? = inTexture.logDesc()
// let _: Float32? = outTexture.logDesc()
// self.tensorPrint(tensor: input, dim: [2, 3, 4])
// let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [24])
// self.tensorPrint(tensor: tx, dim: [24])
// }
//
//
// buffer.commit()
}
public func testTranspose() {
......
......@@ -30,7 +30,7 @@ public class MobileNet_ssd_AR: Net{
class MobilenetssdPreProccess: CusomKernel {
init(device: MTLDevice) {
let s = CusomKernel.Shape.init(inWidth: 160, inHeight: 160, inChannel: 3)
super.init(device: device, inFunctionName: "mobilent_ar_preprocess_half", outputDim: s, usePaddleMobileLib: false)
super.init(device: device, inFunctionName: "mobilent_ar_preprocess", outputDim: s, usePaddleMobileLib: false)
}
}
......
......@@ -19,11 +19,14 @@ class BatchNormParam<P: PrecisionType>: OpParam {
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try BatchNormParam.inputX(inputs: opDesc.inputs, from: inScope)
if input.transpose != [0, 2, 3, 1] {
fatalError("batch norm only accepts NHWC")
}
output = try BatchNormParam.outputY(outputs: opDesc.outputs, from: inScope)
inputBias = try BatchNormParam.inputBiase(inputs: opDesc.paraInputs, from: inScope)
inputMean = try BatchNormParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
inputScale = try BatchNormParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
inputVariance = try BatchNormParam.inputVariance(inputs: opDesc.paraInputs, from: inScope)
bias = try BatchNormParam.getFirstTensor(key: "Bias", map: opDesc.paraInputs, from: inScope)
mean = try BatchNormParam.getFirstTensor(key: "Mean", map: opDesc.paraInputs, from: inScope)
scale = try BatchNormParam.getFirstTensor(key: "Scale", map: opDesc.paraInputs, from: inScope)
variance = try BatchNormParam.getFirstTensor(key: "Variance", map: opDesc.paraInputs, from: inScope)
epsilon = try BatchNormParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
momentum = try BatchNormParam.getAttr(key: "momentum", attrs: opDesc.attrs)
} catch let error {
......@@ -32,10 +35,10 @@ class BatchNormParam<P: PrecisionType>: OpParam {
}
let input: Texture<P>
var output: Texture<P>
let inputBias: Tensor<ParamPrecisionType>
let inputMean: Tensor<ParamPrecisionType>
let inputScale: Tensor<ParamPrecisionType>
let inputVariance: Tensor<ParamPrecisionType>
let bias: Tensor<P>
let mean: Tensor<P>
let scale: Tensor<P>
let variance: Tensor<P>
let epsilon: Float
let momentum: Float
}
......
......@@ -19,15 +19,15 @@ class BilinearInterpParam<P: PrecisionType>: OpParam {
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try BilinearInterpParam.inputX(inputs: opDesc.inputs, from: inScope)
// if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) {
// fatalError()
// }
output = try BilinearInterpParam.outputOut(outputs: opDesc.outputs, from: inScope)
out_h = try BilinearInterpParam.getAttr(key: "out_h", attrs: opDesc.attrs)
out_w = try BilinearInterpParam.getAttr(key: "out_w", attrs: opDesc.attrs)
} catch let error {
throw error
}
if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) {
fatalError()
}
}
let input: Texture<P>
var output: Texture<P>
......@@ -53,6 +53,15 @@ class BilinearInterpOp<P: PrecisionType>: Operator<BilinearInterpKernel<P>, Bili
func delogOutput() {
print(" \(type) output: ")
let padToFourDim = para.output.padToFourDim
if para.output.transpose == [0, 1, 2, 3] {
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] {
print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3])).strideArray())
} else {
fatalError(" not implemet")
}
}
}
......
......@@ -14,7 +14,24 @@
import Foundation
class FlattenOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>, Runable, Creator, InferShaperable{
class FlattenParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try FlattenParam.inputX(inputs: opDesc.inputs, from: inScope)
output = try FlattenParam.outputOut(outputs: opDesc.outputs, from: inScope)
axis = try FlattenParam.getAttr(key: "axis", attrs: opDesc.attrs)
} catch let error {
throw error
}
}
let input: Texture<P>
var output: Texture<P>
let axis: Int
}
class FlattenOp<P: PrecisionType>: Operator<FlattenKernel<P>, FlattenParam<P>>, Runable, Creator, InferShaperable{
typealias OpType = FlattenOp<P>
......
......@@ -15,20 +15,20 @@
import Foundation
class BatchNormKernel<P: PrecisionType>: Kernel, Computable {
// var newScale: MTLBuffer
// var newBias: MTLBuffer
//
required init(device: MTLDevice, param: BatchNormParam<P>) {
// guard let newScale = device.makeBuffer(length: param.inputScale.buffer.length) else {
// fatalError()
// }
//
// guard let newBias = device.makeBuffer(length: param.inputBias.buffer.length) else {
// fatalError()
// }
// self.newScale = newScale
// self.newBias = newBias
//
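// Fold the batch-norm statistics into a single scale/bias pair so the Metal kernel
// only has to compute x * scale' + bias':
//   scale' = scale / sqrt(variance + epsilon)
//   bias'  = bias - mean * scale / sqrt(variance + epsilon)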
let count = param.variance.dim.numel()
let varianceP = param.variance.data.pointer
let meanP = param.mean.data.pointer
let scaleP = param.scale.data.pointer
let biasP = param.bias.data.pointer
for i in 0..<count {
let invStd = P(1 / (Float32(varianceP[i]) + param.epsilon).squareRoot())
biasP[i] = biasP[i] - meanP[i] * invStd * scaleP[i]
scaleP[i] = invStd * scaleP[i]
}
param.bias.initBuffer(device: device, precision: computePrecision)
param.scale.initBuffer(device: device, precision: computePrecision)
param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "batchnorm")
} else if computePrecision == .Float16 {
......@@ -36,37 +36,16 @@ class BatchNormKernel<P: PrecisionType>: Kernel, Computable {
} else {
fatalError()
}
//
// let varianceBuffer : MTLBuffer = param.inputVariance.buffer
//
// var invStd: [Float32] = Array(repeating: 0, count: varianceBuffer.length)
// let varianceContents = varianceBuffer.contents().assumingMemoryBound(to: P.self)
// for i in 0..<(varianceBuffer.length / MemoryLayout<P>.stride) {
// invStd[i] = 1 / (Float32(varianceContents[i]) + param.epsilon).squareRoot()
// }
//
// let newScaleContents = newScale.contents().assumingMemoryBound(to: P.self)
// let newBiasContents = newBias.contents().assumingMemoryBound(to: P.self)
// let scale : MTLBuffer = param.inputScale.buffer
// let scaleContents = scale.contents().assumingMemoryBound(to: P.self)
// let bias : MTLBuffer = param.inputBias.buffer
// let biasContents = bias.contents().assumingMemoryBound(to: P.self)
// let meanContents = param.inputMean.buffer.contents().assumingMemoryBound(to: P.self)
//
// for i in 0..<(newScale.length / MemoryLayout<P>.stride) {
// newScaleContents[i] = P(invStd[i] * Float32(scaleContents[i]))
// newBiasContents[i] = P(Float32(biasContents[i]) - Float32(meanContents[i]) * invStd[i] * Float32(scaleContents[i]))
// }
}
func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
}
// encoder.setTexture(param.input.metalTexture, index: 0)
// encoder.setTexture(param.output.metalTexture, index: 1)
// encoder.setBuffer(newScale, offset: 0, index: 0)
// encoder.setBuffer(newBias, offset: 0, index: 1)
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBuffer(param.scale.buffer, offset: 0, index: 0)
encoder.setBuffer(param.bias.buffer, offset: 0, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
......
......@@ -31,101 +31,111 @@ struct ConcatMetalParam {
}
class ConcatKernel<P: PrecisionType>: Kernel, Computable{
func encodeTest(_ cmdBuffer: MTLCommandBuffer, _ param: ConcatTestParam, _ istart: Int, _ iend: Int) {
let encoder = cmdBuffer.makeComputeCommandEncoder()!
var p = ConcatMetalParam.init()
var odim: [Int32] = [1, 1, 1, 1]
for i in 0..<param.odim.count {
odim[4-param.odim.count+i] = Int32(param.odim[i])
}
p.odim = (odim[0], odim[1], odim[2], odim[3])
p.axis = Int32(4 - param.odim.count + param.axis)
for i in 0..<istart {
p.offset += Int32(param.dims[i][param.axis])
var v = "normal"
var pm = ConcatMetalParam.init()
func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
var vdim: [Int32] = []
for i in 0..<(iend - istart) {
encoder.setTexture(param.input[i+istart], index: i)
vdim.append(Int32(param.dims[i+istart][Int(param.axis)]))
let num = param.input.count
for i in 0..<num {
encoder.setTexture(param.input[i].metalTexture, index: i)
}
for i in (iend-istart)..<6 {
encoder.setTexture(param.input[0], index: i)
vdim.append(0)
encoder.setTexture(param.output.metalTexture, index: num)
if v == "normal" {
encoder.setTexture(param.output.metalTexture, index: num + 1)
}
p.vdim = (vdim[0], vdim[1], vdim[2], vdim[3], vdim[4], vdim[5])
encoder.setTexture(param.output, index: 6)
encoder.setTexture(param.output, index: 7)
encoder.setBytes(&p, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output)
encoder.setBytes(&pm, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
func encode(_ cmdBuffer: MTLCommandBuffer, _ param: ConcatParam<P>, _ istart: Int, _ iend: Int) throws {
guard let encoder = cmdBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
var p = ConcatMetalParam.init()
let odim = (0..<4).map { Int32(param.output.dim[$0]) }
p.odim = (odim[0], odim[1], odim[2], odim[3])
p.axis = Int32(4 - param.output.tensorDim.cout() + param.axis)
required init(device: MTLDevice, param: ConcatParam<P>) {
param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: computePrecision)
let orank = param.output.tensorDim.cout()
let num = param.input.count
assert(num <= 6)
var axis = 4 - param.output.tensorDim.cout() + param.axis
for i in 0..<4 {
if Int32(param.transpose[i]) == p.axis {
p.axis = Int32(i)
if param.transpose[i] == axis {
axis = i
break
}
}
for i in 0..<istart {
p.offset += Int32(param.input[i+istart].dim[Int(p.axis)])
}
var vdim: [Int32] = []
for i in 0..<(iend - istart) {
encoder.setTexture(param.input[i+istart].metalTexture, index: i)
vdim.append(Int32(param.input[i+istart].dim[Int(p.axis)]))
}
for i in (iend-istart)..<6 {
encoder.setTexture(param.input[0].metalTexture, index: i)
vdim.append(0)
}
p.trans = (Int32(param.transpose[0]), Int32(param.transpose[1]), Int32(param.transpose[2]), Int32(param.transpose[3]))
p.vdim = (vdim[0], vdim[1], vdim[2], vdim[3], vdim[4], vdim[5])
encoder.setTexture(param.output.metalTexture, index: 6)
encoder.setTexture(param.output.metalTexture, index: 7)
encoder.setBytes(&p, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam<P>) throws {
let group = param.input.count / 6
let remain = param.input.count % 6
for i in 0..<group {
try self.encode(commandBuffer, param, 6 * i, 6 * (i + 1))
}
if remain > 0 {
try self.encode(commandBuffer, param, 6 * group, param.input.count)
}
}
func test(cmdBuffer: MTLCommandBuffer, param: ConcatTestParam) {
let group = param.input.count / 6
let remain = param.input.count % 6
for i in 0..<group {
self.encodeTest(cmdBuffer, param, 6 * i, 6 * (i + 1))
pm.axis = Int32(axis)
pm.odim = (Int32(param.output.dim[0]), Int32(param.output.dim[1]), Int32(param.output.dim[2]), Int32(param.output.dim[3]))
pm.trans = (Int32(param.output.transpose[0]), Int32(param.output.transpose[1]), Int32(param.output.transpose[2]), Int32(param.output.transpose[3]))
var vdim: [Int] = [0, 0, 0, 0, 0, 0]
for i in 0..<num {
vdim[i] = param.input[i].dim[axis]
}
if remain > 0 {
self.encodeTest(cmdBuffer, param, 6 * group, param.input.count)
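// Pick a specialized concat variant when the axis maps onto a texture axis: "y" and "x"
// offset along the texture height/width, "z" copies whole RGBA slices; the slice-packed
// cases additionally require every input's size along the axis to be a multiple of 4.
// Anything else falls back to the generic "normal" gather kernel.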
if orank == 4 {
if axis == 1 {
v = "y"
} else if axis == 2 {
v = "x"
} else {
if (param.output.dim[0] == 1) && axis == 3 {
var vz = true
for i in 0..<num {
if vdim[i] % 4 != 0 {
vz = false
break
}
}
if vz {
v = "z"
for i in 0..<num {
vdim[i] = vdim[i] / 4
}
}
}
}
} else if orank == 3 {
if axis == 2 {
v = "y"
} else if axis == 3 {
v = "x"
} else if axis == 1 {
var vz = true
for i in 0..<num {
if vdim[i] % 4 != 0 {
vz = false
break
}
}
if vz {
v = "z"
for i in 0..<num {
vdim[i] = vdim[i] / 4
}
}
}
} else {
if axis == 2 {
v = "y"
} else if axis == 3 {
var vx = true
for i in 0..<num {
if vdim[i] % 4 != 0 {
vx = false
break
}
}
if vx {
v = "x"
for i in 0..<num {
vdim[i] = vdim[i] / 4
}
}
}
}
}
required init(device: MTLDevice, param: ConcatParam<P>) {
param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: computePrecision)
pm.vdim = (Int32(vdim[0]), Int32(vdim[1]), Int32(vdim[2]), Int32(vdim[3]), Int32(vdim[4]), Int32(vdim[5]))
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "concat")
super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_float")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "concat_half")
super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_half")
} else {
fatalError()
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
struct FlattenMetalParam {
var idim: (Int32, Int32, Int32, Int32)
var itrans: (Int32, Int32, Int32, Int32)
var odim: (Int32, Int32, Int32, Int32)
var otrans: (Int32, Int32, Int32, Int32)
}
class FlattenKernel<P: PrecisionType>: Kernel, Computable{
var metalParam: FlattenMetalParam
required init(device: MTLDevice, param: FlattenParam<P>) {
param.output.initTexture(device: device, computePrecision: computePrecision)
var id: [Int32] = [1, 1, 1, 1]
for i in 0..<param.input.tensorDim.cout() {
id[4-param.input.tensorDim.cout()+i] = Int32(param.input.tensorDim[i])
}
let it: [Int32] = param.input.transpose.map { Int32($0) }
var od: [Int32] = [1, 1, 1, 1]
for i in 0..<param.output.tensorDim.cout() {
od[4-param.output.tensorDim.cout()+i] = Int32(param.output.tensorDim[i])
}
let ot: [Int32] = param.output.transpose.map { Int32($0) }
metalParam = FlattenMetalParam.init(
idim: (id[0], id[1], id[2], id[3]),
itrans: (it[0], it[1], it[2], it[3]),
odim: (od[0], od[1], od[2], od[3]),
otrans: (ot[0], ot[1], ot[2], ot[3])
)
let irank = param.input.tensorDim.cout()
let orank = param.output.tensorDim.cout()
assert(orank == 2)
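// Flatten always produces a rank-2 output, so it reuses the templated reshape kernels
// (reshape_<input rank>_2_{float,half}) generated in ReshapeKernel.inc.metal.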
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "reshape_\(irank)_2_float")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "reshape_\(irank)_2_half")
} else {
fatalError()
}
}
func compute(commandBuffer: MTLCommandBuffer, param: FlattenParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<FlattenMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
}
......@@ -49,10 +49,12 @@ class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
odim: (od[0], od[1], od[2], od[3]),
otrans: (ot[0], ot[1], ot[2], ot[3])
)
let irank = param.input.tensorDim.cout()
let orank = param.output.tensorDim.cout()
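// The reshape kernel is specialized per (input rank, output rank) pair; the function name
// below must match one of the variants instantiated from ReshapeKernel.inc.metal.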
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "reshape")
super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_float")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "reshape_half")
super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_half")
} else {
fatalError()
}
......@@ -69,10 +71,11 @@ class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
}
func compute(commandBuffer: MTLCommandBuffer, param: ReshapeParam<P>) throws {
print("reshape compute")
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
......@@ -81,15 +84,15 @@ class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
encoder.endEncoding()
}
func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
fatalError()
}
encoder.setTexture(testParam.inputTexture, index: 0)
encoder.setTexture(testParam.outputTexture, index: 1)
var pm: ReshapeMetalParam = testParam.param
encoder.setBytes(&pm, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture)
encoder.endEncoding()
}
// func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) {
// guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
// fatalError()
// }
// encoder.setTexture(testParam.inputTexture, index: 0)
// encoder.setTexture(testParam.outputTexture, index: 1)
// var pm: ReshapeMetalParam = testParam.param
// encoder.setBytes(&pm, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
// encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture)
// encoder.endEncoding()
// }
}
......@@ -19,11 +19,12 @@ struct ShapeMetalParam {
class ShapeKernel<P: PrecisionType>: Kernel, Computable{
func compute(commandBuffer: MTLCommandBuffer, param: ShapeParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.output.metalTexture, index: 0)
encoder.endEncoding()
print("shape compute")
// guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
// throw PaddleMobileError.predictError(message: " encode is nil")
// }
// encoder.setTexture(param.output.metalTexture, index: 0)
// encoder.endEncoding()
}
required init(device: MTLDevice, param: ShapeParam<P>) {
......
......@@ -15,23 +15,76 @@
import Foundation
struct SplitMetalParam {
var idim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1)
var axis: Int32 = 0
var offset: Int32 = 0
var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
var vdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
}
class SplitKernel<P: PrecisionType>: Kernel, Computable{
var smp: SplitMetalParam
func compute(commandBuffer: MTLCommandBuffer, param: SplitParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.output.metalTexture, index: 0)
encoder.setTexture(param.input.metalTexture, index: 0)
for i in 0..<param.outputList.count {
encoder.setTexture(param.outputList[i].metalTexture, index: i + 1)
}
encoder.setBytes(&smp, length: MemoryLayout<SplitMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture)
encoder.endEncoding()
}
required init(device: MTLDevice, param: SplitParam<P>) {
param.output.initTexture(device: device, computePrecision: computePrecision)
// param.output.initTexture(device: device, computePrecision: computePrecision)
let num = param.outputList.count
let rank = param.input.tensorDim.cout()
assert(num >= 2 && num <= 4)
for output in param.outputList {
output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
}
smp = SplitMetalParam.init()
smp.idim = (Int32(param.input.dim[0]), Int32(param.input.dim[1]), Int32(param.input.dim[2]), Int32(param.input.dim[3]))
smp.axis = Int32(param.axis + param.input.dim.cout() - param.input.tensorDim.cout())
for i in 0..<4 {
if param.input.transpose[i] == smp.axis {
smp.axis = Int32(i)
break
}
}
smp.trans = (Int32(param.input.transpose[0]), Int32(param.input.transpose[1]), Int32(param.input.transpose[2]), Int32(param.input.transpose[3]))
var vdim: [Int32] = [0, 0, 0, 0]
for i in 0..<num {
vdim[i] = Int32(param.outputList[i].tensorDim[param.axis])
}
smp.vdim = (vdim[0], vdim[1], vdim[2], vdim[3])
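// Only splits whose axis maps onto a texture axis are supported: "y" splits along the
// texture height, "x" along the width; any other rank/axis combination aborts below.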
var v = "normal"
if rank == 4 {
if smp.axis == 1 {
v = "y"
} else if smp.axis == 2 {
v = "x"
}
} else if rank == 3 {
if smp.axis == 2 {
v = "y"
} else if smp.axis == 3 {
v = "x"
}
} else if rank == 2 {
if smp.axis == 2 {
v = "y"
}
}
if v == "normal" {
fatalError("split unsupported")
}
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "split")
super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "split_half")
super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_half")
} else {
fatalError()
}
......
......@@ -15,28 +15,28 @@
#include <metal_stdlib>
using namespace metal;
kernel void batchnorm_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
const device half4 * newScale [[buffer(0)]],
const device half4 * newBias [[buffer(1)]],
kernel void batchnorm(texture2d_array<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
const device float4 * newScale [[buffer(0)]],
const device float4 * newBias [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
const half4 input = inTexture.read(gid.xy, gid.z);
half4 output = input * newScale[gid.z] + newBias[gid.z];
const float4 input = inTexture.read(gid.xy, gid.z);
float4 output = input * newScale[gid.z] + newBias[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
kernel void batchnorm(texture2d_array<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
const device float4 * newScale [[buffer(0)]],
const device float4 * newBias [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
kernel void batchnorm_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
const device half4 * newScale [[buffer(0)]],
const device half4 * newBias [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
const float4 input = inTexture.read(gid.xy, gid.z);
float4 output = input * newScale[gid.z] + newBias[gid.z];
const half4 input = inTexture.read(gid.xy, gid.z);
half4 output = input * newScale[gid.z] + newBias[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
......@@ -23,7 +23,7 @@ struct bilinear_interp_param {
};
kernel void bilinear_interp(texture2d_array<float, access::read> input [[texture(0)]],
texture2d_array<float, access::write> output [[texture(2)]],
texture2d_array<float, access::write> output [[texture(1)]],
constant bilinear_interp_param & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
float4 r;
......@@ -47,29 +47,29 @@ kernel void bilinear_interp(texture2d_array<float, access::read> input [[texture
output.write(r, gid.xy, gid.z);
}
kernel void bilinear_interp_half(texture2d_array<half, access::read> input [[texture(0)]],
texture2d_array<half, access::write> output [[texture(2)]],
constant bilinear_interp_param & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
half4 r;
if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
r = input.read(gid.xy, gid.z);
} else {
half w = gid.x * pm.ratio_w;
half h = gid.y * pm.ratio_h;
uint w0 = w, h0 = h;
uint w1 = w0 + 1, h1 = h0 + 1;
half w1lambda = w - w0, h1lambda = h - h0;
half w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
if (w1 >= input.get_width()) w1 = w0;
if (h1 >= input.get_height()) h1 = h0;
half4 r0 = input.read(uint2(w0, h0), gid.z);
half4 r1 = input.read(uint2(w1, h0), gid.z);
half4 r2 = input.read(uint2(w0, h1), gid.z);
half4 r3 = input.read(uint2(w1, h1), gid.z);
r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);
}
output.write(r, gid.xy, gid.z);
output.write(r, gid.xy, gid.z);
}
//kernel void bilinear_interp_half(texture2d_array<half, access::read> input [[texture(0)]],
// texture2d_array<half, access::write> output [[texture(1)]],
// constant bilinear_interp_param & pm [[buffer(0)]],
// uint3 gid [[thread_position_in_grid]]) {
//
// half4 r;
// if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
// r = input.read(gid.xy, gid.z);
// } else {
// half w = gid.x * pm.ratio_w;
// half h = gid.y * pm.ratio_h;
// uint w0 = w, h0 = h;
// uint w1 = w0 + 1, h1 = h0 + 1;
// half w1lambda = w - w0, h1lambda = h - h0;
// half w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
// if (w1 >= input.get_width()) w1 = w0;
// if (h1 >= input.get_height()) h1 = h0;
// half4 r0 = input.read(uint2(w0, h0), gid.z);
// half4 r1 = input.read(uint2(w1, h0), gid.z);
// half4 r2 = input.read(uint2(w0, h1), gid.z);
// half4 r3 = input.read(uint2(w1, h1), gid.z);
// r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);
// }
// output.write(r, gid.xy, gid.z);
// output.write(r, gid.xy, gid.z);
//}
......@@ -15,6 +15,55 @@
#include <metal_stdlib>
using namespace metal;
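// Helpers for converting between texture coordinates xyzn = (x, y, slice, lane-in-texel)
// and 4-element tensor coordinates abcd, padded with leading size-1 dims for ranks 1-3.
// The _4 variants additionally take the size C of the last dimension, because the first
// and last tensor dimensions are packed together across the texture's array slices.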
inline void xyzn2abcd_1(int xyzn[4], int abcd[4]) {
abcd[0] = abcd[1] = abcd[2] = 0;
abcd[3] = xyzn[0] * 4 + xyzn[3];
}
inline void xyzn2abcd_2(int xyzn[4], int abcd[4]) {
abcd[0] = abcd[1] = 0;
abcd[2] = xyzn[1];
abcd[3] = xyzn[0] * 4 + xyzn[3];
}
inline void xyzn2abcd_3(int xyzn[4], int abcd[4]) {
abcd[0] = 0;
abcd[3] = xyzn[0];
abcd[2] = xyzn[1];
abcd[1] = xyzn[2] * 4 + xyzn[3];
}
inline void xyzn2abcd_4(int C, int xyzn[4], int abcd[4]) {
abcd[2] = xyzn[0];
abcd[1] = xyzn[1];
uint t = xyzn[2] * 4 + xyzn[3];
abcd[0] = t / C;
abcd[3] = t % C;
}
inline void abcd2xyzn_1(int abcd[4], int xyzn[4]) {
xyzn[1] = xyzn[2] = 0;
xyzn[0] = abcd[3] / 4;
xyzn[3] = abcd[3] % 4;
}
inline void abcd2xyzn_2(int abcd[4], int xyzn[4]) {
xyzn[2] = 0;
xyzn[1] = abcd[2];
xyzn[0] = abcd[3] / 4;
xyzn[3] = abcd[3] % 4;
}
inline void abcd2xyzn_3(int abcd[4], int xyzn[4]) {
xyzn[0] = abcd[3];
xyzn[1] = abcd[2];
xyzn[2] = abcd[1] / 4;
xyzn[3] = abcd[1] % 4;
}
inline void abcd2xyzn_4(int C, int abcd[4], int xyzn[4]) {
xyzn[0] = abcd[2];
xyzn[1] = abcd[1];
uint t = abcd[0] * C + abcd[3];
xyzn[2] = t / 4;
xyzn[3] = t % 4;
}
inline void xyzn2abcd(int C, int xyzn[4], int abcd[4]) {
abcd[2] = xyzn[0];
abcd[1] = xyzn[1];
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
struct ConcatParam {
int32_t odim[4];
int32_t axis;
int32_t offset;
int32_t trans[4];
int32_t vdim[6];
};
kernel void concat(texture2d_array<float, access::read> in0 [[texture(0)]],
texture2d_array<float, access::read> in1 [[texture(1)]],
texture2d_array<float, access::read> in2 [[texture(2)]],
texture2d_array<float, access::read> in3 [[texture(3)]],
texture2d_array<float, access::read> in4 [[texture(4)]],
texture2d_array<float, access::read> in5 [[texture(5)]],
texture2d_array<float, access::read> inx [[texture(6)]],
texture2d_array<float, access::write> out [[texture(7)]],
constant ConcatParam & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
ConcatParam cp = pm;
int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4];
float4 r;
for (int i = 0; i < 4; i++) {
xyzn[3] = i;
xyzn2abcd(cp.odim[3], xyzn, abcd);
int k = abcd[cp.axis] - cp.offset;
int j = 0;
if (k < 0) {
r[i] = inx.read(gid.xy, gid.z)[i];
} else {
for (; j < 6; j++) {
if (k < cp.vdim[j]) {
break;
}
k -= cp.vdim[j];
}
int ta = cp.odim[cp.axis];
abcd[cp.axis] = k;
cp.odim[cp.axis] = cp.vdim[j];
abcd2xyzn(cp.odim[3], abcd, oxyzn);
cp.odim[cp.axis] = ta;
switch (j) {
case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
}
}
}
out.write(r, gid.xy, gid.z);
}
kernel void concat_half(texture2d_array<half, access::read> in0 [[texture(0)]],
texture2d_array<half, access::read> in1 [[texture(1)]],
texture2d_array<half, access::read> in2 [[texture(2)]],
texture2d_array<half, access::read> in3 [[texture(3)]],
texture2d_array<half, access::read> in4 [[texture(4)]],
texture2d_array<half, access::read> in5 [[texture(5)]],
texture2d_array<half, access::read> inx [[texture(6)]],
texture2d_array<half, access::write> out [[texture(7)]],
constant ConcatParam & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
ConcatParam cp = pm;
int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4];
half4 r;
for (int i = 0; i < 4; i++) {
xyzn[3] = i;
xyzn2abcd(cp.odim[3], xyzn, abcd);
int k = abcd[cp.axis] - cp.offset;
int j = 0;
if (k < 0) {
r[i] = inx.read(gid.xy, gid.z)[i];
} else {
for (; j < 6; j++) {
if (k < cp.vdim[j]) {
break;
}
k -= cp.vdim[j];
}
int ta = cp.odim[cp.axis];
abcd[cp.axis] = k;
cp.odim[cp.axis] = cp.vdim[j];
abcd2xyzn(cp.odim[3], abcd, oxyzn);
cp.odim[cp.axis] = ta;
switch (j) {
case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
}
}
}
out.write(r, gid.xy, gid.z);
}
#ifdef P
#define CONCAT2(a, b) a ## b
#define CONCAT2_(a, b) a ## _ ## b
#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d
#define CONCAT5_(a, b, c, d, e) a ## _ ## b ## _ ## c ## _ ## d ## _ ## e
#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p)
#define VECTOR(p, n) CONCAT2(p, n)
#define FUNC_R(f, r) CONCAT2_(f, r)
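// FUNC(concat, R, N, VV, P) pastes its arguments into a kernel name such as
// concat_3_5_x_float, matching the name built on the Swift side; VECTOR(P, 4)
// expands to float4 or half4 depending on the precision P.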
#if V == VX
#define VV x
#elif V == VY
#define VV y
#elif V == VZ
#define VV z
#else
#define VV normal
#endif
#if V == VNORMAL
//kernel void FUNC(concat, R, N, normal, P)(array<texture2d_array<P, access::read>, N> in [[texture(0)]],
// texture2d_array<P, access::read> out_x [[texture(N)]],
// texture2d_array<P, access::write> out [[texture(N+1)]],
// constant ConcatParam & pm [[buffer(0)]],
// uint3 gid [[thread_position_in_grid]]) {
//}
kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
texture2d_array<P, access::read> in1 [[texture(1)]],
#if N >= 3
texture2d_array<P, access::read> in2 [[texture(2)]],
#endif
#if N >= 4
texture2d_array<P, access::read> in3 [[texture(3)]],
#endif
#if N >= 5
texture2d_array<P, access::read> in4 [[texture(4)]],
#endif
#if N >= 6
texture2d_array<P, access::read> in5 [[texture(5)]],
#endif
texture2d_array<P, access::read> inx [[texture(N)]],
texture2d_array<P, access::write> out [[texture(N+1)]],
constant ConcatParam & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
ConcatParam cp = pm;
int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4];
VECTOR(P, 4) r = inx.read(gid.xy, gid.z);
for (int i = 0; i < 4; i++) {
xyzn[3] = i;
#if R == 4
xyzn2abcd_4(cp.odim[3], xyzn, abcd);
#else
FUNC_R(xyzn2abcd, R)(xyzn, abcd);
#endif
int k = abcd[cp.axis] - cp.offset;
if (k < 0) continue;
int j = 0;
for (; j < N; j++) {
if (k < cp.vdim[j]) {
break;
}
k -= cp.vdim[j];
}
if (k > cp.vdim[N-1]) {
continue;
}
int ta = cp.odim[cp.axis];
abcd[cp.axis] = k;
cp.odim[cp.axis] = cp.vdim[j];
#if R == 4
abcd2xyzn_4(cp.odim[3], abcd, oxyzn);
#else
FUNC_R(abcd2xyzn, R)(abcd, oxyzn);
#endif
cp.odim[cp.axis] = ta;
switch (j) {
case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
#if N >= 3
case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
#endif
#if N >= 4
case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
#endif
#if N >= 5
case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
#endif
#if N >= 6
case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
#endif
}
}
out.write(r, gid.xy, gid.z);
}
#endif // V == NORMAL
#if V == VX
kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
texture2d_array<P, access::read> in1 [[texture(1)]],
#if N >= 3
texture2d_array<P, access::read> in2 [[texture(2)]],
#endif // N >= 3
#if N >= 4
texture2d_array<P, access::read> in3 [[texture(3)]],
#endif // N >= 4
#if N >= 5
texture2d_array<P, access::read> in4 [[texture(4)]],
#endif // N >= 5
#if N >= 6
texture2d_array<P, access::read> in5 [[texture(5)]],
#endif // N >= 6
texture2d_array<P, access::write> out [[texture(N)]],
constant ConcatParam & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
int x = gid.x - pm.offset;
if (x < 0) return;
if (x < pm.vdim[0]) {
VECTOR(P, 4) r = in0.read(gid.xy, gid.z);
out.write(r, gid.xy, gid.z);
return;
}
x -= pm.vdim[0];
if (x < pm.vdim[1]) {
VECTOR(P, 4) r = in1.read(uint2(x, gid.y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#if N >= 3
x -= pm.vdim[1];
if (x < pm.vdim[2]) {
VECTOR(P, 4) r = in2.read(uint2(x, gid.y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 3
#if N >= 4
x -= pm.vdim[2];
if (x < pm.vdim[3]) {
VECTOR(P, 4) r = in3.read(uint2(x, gid.y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 4
#if N >= 5
x -= pm.vdim[3];
if (x < pm.vdim[4]) {
VECTOR(P, 4) r = in4.read(uint2(x, gid.y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 5
#if N >= 6
x -= pm.vdim[4];
if (x < pm.vdim[5]) {
VECTOR(P, 4) r = in5.read(uint2(x, gid.y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 6
}
#endif // V == VX
#if V == VY
kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
texture2d_array<P, access::read> in1 [[texture(1)]],
#if N >= 3
texture2d_array<P, access::read> in2 [[texture(2)]],
#endif // N >= 3
#if N >= 4
texture2d_array<P, access::read> in3 [[texture(3)]],
#endif // N >= 4
#if N >= 5
texture2d_array<P, access::read> in4 [[texture(4)]],
#endif // N >= 5
#if N >= 6
texture2d_array<P, access::read> in5 [[texture(5)]],
#endif // N >= 6
texture2d_array<P, access::write> out [[texture(N)]],
constant ConcatParam & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
int y = gid.y - pm.offset;
if (y < 0) return;
if (y < pm.vdim[0]) {
VECTOR(P, 4) r = in0.read(gid.xy, gid.z);
out.write(r, gid.xy, gid.z);
return;
}
y -= pm.vdim[0];
if (y < pm.vdim[1]) {
VECTOR(P, 4) r = in1.read(uint2(gid.x, y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#if N >= 3
y -= pm.vdim[1];
if (y < pm.vdim[2]) {
VECTOR(P, 4) r = in2.read(uint2(gid.x, y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 3
#if N >= 4
y -= pm.vdim[2];
if (y < pm.vdim[3]) {
VECTOR(P, 4) r = in3.read(uint2(gid.x, y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 4
#if N >= 5
y -= pm.vdim[3];
if (y < pm.vdim[4]) {
VECTOR(P, 4) r = in4.read(uint2(gid.x, y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 5
#if N >= 6
y -= pm.vdim[4];
if (y < pm.vdim[5]) {
VECTOR(P, 4) r = in5.read(uint2(gid.x, y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 6
}
#endif // V == VY
#if V == VZ
kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
texture2d_array<P, access::read> in1 [[texture(1)]],
#if N >= 3
texture2d_array<P, access::read> in2 [[texture(2)]],
#endif // N >= 3
#if N >= 4
texture2d_array<P, access::read> in3 [[texture(3)]],
#endif // N >= 4
#if N >= 5
texture2d_array<P, access::read> in4 [[texture(4)]],
#endif // N >= 5
#if N >= 6
texture2d_array<P, access::read> in5 [[texture(5)]],
#endif // N >= 6
texture2d_array<P, access::write> out [[texture(N)]],
constant ConcatParam & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
int z = gid.z - pm.offset;
if (z < 0) return;
if (z < pm.vdim[0]) {
VECTOR(P, 4) r = in0.read(gid.xy, gid.z);
out.write(r, gid.xy, gid.z);
return;
}
z -= pm.vdim[0];
if (z < pm.vdim[1]) {
VECTOR(P, 4) r = in1.read(gid.xy, z);
out.write(r, gid.xy, gid.z);
return;
}
#if N >= 3
z -= pm.vdim[1];
if (z < pm.vdim[2]) {
VECTOR(P, 4) r = in2.read(gid.xy, z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 3
#if N >= 4
z -= pm.vdim[2];
if (z < pm.vdim[3]) {
VECTOR(P, 4) r = in3.read(gid.xy, z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 4
#if N >= 5
z -= pm.vdim[3];
if (z < pm.vdim[4]) {
VECTOR(P, 4) r = in4.read(gid.xy, z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 5
#if N >= 6
z -= pm.vdim[4];
if (z < pm.vdim[5]) {
VECTOR(P, 4) r = in5.read(gid.xy, z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 6
}
#endif // V == VZ
#undef VV
#endif // #ifdef P
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
struct ConcatParam {
int32_t odim[4];
int32_t axis;
int32_t offset;
int32_t trans[4];
int32_t vdim[6];
};
#define VNORMAL 1
#define VX 2
#define VY 3
#define VZ 4
// >> fast mode
// only support concat_{2,3,4}_{2,3,4,5,6}_y_{float,half}
// only support concat_{3,4}_{2,3,4,5,6}_x_{float,half}
// only support concat_{1,2,3,4}_{2,3,4,5,6}_z_{float,half}
// >> normal mode (loop mode)
// ssd-ar: (R=4, N=3, V=z), (R=3, N=2, V=y), (R=2, N=5, V=x), (R=3, N=5, V=x)
// ssd: (R=2, N=6, V=y), (R=3, N=6, V=y)
// genet: (R=4, N=2, V=normal)
// ssd-ar: (R=3, N=5, V=x)
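// Each block below instantiates ConcatKernel.inc.metal once per (V, R, N) combination,
// first with P = float and then with P = half; for example, the next block generates
// concat_3_5_x_float and concat_3_5_x_half.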
#define V VX
#define R 3
#define N 5
#define P float
#include "ConcatKernel.inc.metal"
#undef P
#define P half
#include "ConcatKernel.inc.metal"
#undef P
#undef N
#undef R
#undef V
// ssd-ar: (R=2, N=5, V=x)
#define V VX
#define R 2
#define N 5
#define P float
#include "ConcatKernel.inc.metal"
#undef P
#define P half
#include "ConcatKernel.inc.metal"
#undef P
#undef N
#undef R
#undef V
// ssd-ar: (R=3, N=2, V=y)
#define V VY
#define R 3
#define N 2
#define P float
#include "ConcatKernel.inc.metal"
#undef P
#define P half
#include "ConcatKernel.inc.metal"
#undef P
#undef N
#undef R
#undef V
// ssd-ar: (R=4, N=3, V=z)
#define V VZ
#define R 4
#define N 3
#define P float
#include "ConcatKernel.inc.metal"
#undef P
#define P half
#include "ConcatKernel.inc.metal"
#undef P
#undef N
#undef R
#undef V
// ssd: (R=2, N=6, V=y)
#define V VY
#define R 2
#define N 6
#define P float
#include "ConcatKernel.inc.metal"
#undef P
#define P half
#include "ConcatKernel.inc.metal"
#undef P
#undef N
#undef R
#undef V
// ssd: (R=3, N=6, V=y)
#define V VY
#define R 3
#define N 6
#define P float
#include "ConcatKernel.inc.metal"
#undef P
#define P half
#include "ConcatKernel.inc.metal"
#undef P
#undef N
#undef R
#undef V
#define V VNORMAL
#define R 4
#define N 2
#define P float
#include "ConcatKernel.inc.metal"
#undef P
#define P half
#include "ConcatKernel.inc.metal"
#undef P
#undef N
#undef R
#undef V
#ifdef P
#define CONCAT2(a, b) a ## b
#define CONCAT2_(a, b) a ## _ ## b
#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d
#define FUNC(f, r1, r2, p) CONCAT4_(f, r1, r2, p)
#define VECTOR(p, n) CONCAT2(p, n)
#define FUNC_R(f, r) CONCAT2_(f, r)
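// For each lane of an output texel: convert the output texture coordinate back to tensor
// coordinates, undo the output transpose to get the element's linear index, then map that
// index through the input dims and transpose to an input texture coordinate and gather the
// value; lanes past the element count are written as 0.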
kernel void FUNC(reshape, RIN, ROUT, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
texture2d_array<P, access::write> outTexture [[texture(1)]],
constant ReshapeParam &rp [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, oabcd[4], ixyzn[4], iabcd[4];
ReshapeParam lrp = rp;
int oC = lrp.odim[lrp.otrans[3]];
int iC = lrp.idim[lrp.itrans[3]];
int count = lrp.odim[0] * lrp.odim[1] * lrp.odim[2] * lrp.odim[3];
VECTOR(P, 4) r;
for (int n = 0; n < 4; n++) {
oxyzn[3] = n;
#if ROUT == 4
xyzn2abcd_4(oC, oxyzn, oabcd);
#else
FUNC_R(xyzn2abcd, ROUT)(oxyzn, oabcd);
#endif
int tabcd[4];
invtrans(lrp.otrans, oabcd, tabcd);
int index = abcd2index(lrp.odim, tabcd);
if (index < count) {
index2abcd(lrp.idim, index, tabcd);
trans(lrp.itrans, tabcd, iabcd);
abcd2xyzn(iC, iabcd, ixyzn);
#if RIN == 4
abcd2xyzn_4(iC, iabcd, ixyzn);
#else
FUNC_R(abcd2xyzn, RIN)(iabcd, ixyzn);
#endif
r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]];
} else {
r[n] = 0;
}
}
outTexture.write(r, gid.xy, gid.z);
}
#endif
......@@ -8,7 +8,7 @@
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
......@@ -24,114 +24,127 @@ struct ReshapeParam {
int32_t otrans[4];
};
//kernel void reshape(texture2d_array<float, access::read> inTexture [[texture(0)]],
// texture2d_array<float, access::write> outTexture [[texture(1)]],
// constant ReshapeParam &rp [[buffer(0)]],
// uint3 gid [[thread_position_in_grid]]) {
// if (gid.x >= outTexture.get_width() ||
// gid.y >= outTexture.get_height() ||
// gid.z >= outTexture.get_array_size()) return;
//
// int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, oabcd[4], ixyzn[4];
// ReshapeParam lrp = rp;
// int oC = lrp.odim[lrp.otrans[3]];
// int iC = lrp.idim[lrp.itrans[3]];
// int count = lrp.odim[0] * lrp.odim[1] * lrp.odim[2] * lrp.odim[3];
// float4 r;
// for (int n = 0; n < 4; n++) {
// oxyzn[3] = n;
//
// //4 (gid.x gid.y, gid.z, 0~4)
// xyzn2abcd(oC, oxyzn, oabcd);
// int tabcd[4];
// invtrans(lrp.otrans, oabcd, tabcd);
// int index = abcd2index(lrp.odim, tabcd);
// if (index < count) {
// int c = index % 4;
//
// int temp0 = index % (inTexture.get_array_size() * 4);
// int slice = temp0 / 4;
//
// int temp1 = index % (inTexture.get_array_size() * 4 * lrp.idim[2]);
// int w = temp1 / (inTexture.get_array_size() * 4);
//
// int h = index / (inTexture.get_array_size() * 4 * lrp.idim[2]);
//
//// index2abcd(lrp.idim, index, tabcd);
//// abcd2xyzn(iC, tabcd, ixyzn);
// r[n] = inTexture.read(uint2(w, h), slice)[c];
// } else {
// r[n] = 0;
// }
// }
// outTexture.write(r, gid.xy, gid.z);
//}
#define P float
#define RIN 4
#define ROUT 4
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 3
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 2
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 1
#include "ReshapeKernel.inc.metal"
#undef ROUT
#undef RIN
#define RIN 3
#define ROUT 4
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 3
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 2
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 1
#include "ReshapeKernel.inc.metal"
#undef ROUT
#undef RIN
#define RIN 2
#define ROUT 4
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 3
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 2
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 1
#include "ReshapeKernel.inc.metal"
#undef ROUT
#undef RIN
#define RIN 1
#define ROUT 4
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 3
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 2
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 1
#include "ReshapeKernel.inc.metal"
#undef ROUT
#undef RIN
kernel void reshape(texture2d_array<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant ReshapeParam &rp [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
#undef P
int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, oabcd[4], ixyzn[4], iabcd[4];
ReshapeParam lrp = rp;
int oC = lrp.odim[lrp.otrans[3]];
int iC = lrp.idim[lrp.itrans[3]];
int count = lrp.odim[0] * lrp.odim[1] * lrp.odim[2] * lrp.odim[3];
float4 r;
for (int n = 0; n < 4; n++) {
oxyzn[3] = n;
xyzn2abcd(oC, oxyzn, oabcd);
int tabcd[4];
invtrans(lrp.otrans, oabcd, tabcd);
int index = abcd2index(lrp.odim, tabcd);
if (index < count) {
index2abcd(lrp.idim, index, tabcd);
trans(lrp.itrans, tabcd, iabcd);
abcd2xyzn(iC, iabcd, ixyzn);
r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]];
} else {
r[n] = 0;
}
}
outTexture.write(r, gid.xy, gid.z);
}
#define P half
#define RIN 4
#define ROUT 4
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 3
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 2
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 1
#include "ReshapeKernel.inc.metal"
#undef ROUT
#undef RIN
#define RIN 3
#define ROUT 4
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 3
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 2
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 1
#include "ReshapeKernel.inc.metal"
#undef ROUT
#undef RIN
kernel void reshape_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant ReshapeParam &rp [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, oabcd[4], ixyzn[4], iabcd[4];
ReshapeParam lrp = rp;
int oC = lrp.odim[lrp.otrans[3]];
int iC = lrp.idim[lrp.itrans[3]];
int count = lrp.odim[0] * lrp.odim[1] * lrp.odim[2] * lrp.odim[3];
half4 r;
for (int n = 0; n < 4; n++) {
oxyzn[3] = n;
xyzn2abcd(oC, oxyzn, oabcd);
int tabcd[4];
invtrans(lrp.otrans, oabcd, tabcd);
int index = abcd2index(lrp.odim, tabcd);
if (index < count) {
index2abcd(lrp.idim, index, tabcd);
trans(lrp.itrans, tabcd, iabcd);
abcd2xyzn(iC, iabcd, ixyzn);
r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]];
} else {
r[n] = 0;
}
}
outTexture.write(r, gid.xy, gid.z);
}
#define RIN 2
#define ROUT 4
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 3
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 2
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 1
#include "ReshapeKernel.inc.metal"
#undef ROUT
#undef RIN
#define RIN 1
#define ROUT 4
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 3
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 2
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 1
#include "ReshapeKernel.inc.metal"
#undef ROUT
#undef RIN
#undef P
#ifdef P
#define CONCAT2(a, b) a ## b
#define CONCAT2_(a, b) a ## _ ## b
#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d
#define CONCAT5_(a, b, c, d, e) a ## _ ## b ## _ ## c ## _ ## d ## _ ## e
#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p)
#define VECTOR(p, n) CONCAT2(p, n)
#define FUNC_R(f, r) CONCAT2_(f, r)
kernel void FUNC(split, R, N, V, P)(texture2d_array<P, access::read> input [[texture(0)]],
texture2d_array<P, access::write> out1 [[texture(1)]],
texture2d_array<P, access::write> out2 [[texture(2)]],
#if N >= 3
texture2d_array<P, access::write> out3 [[texture(3)]],
#endif
#if N >= 4
texture2d_array<P, access::write> out4 [[texture(4)]],
#endif
constant SplitParam &sp [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
VECTOR(P, 4) r = input.read(gid.xy, gid.z);
#if V == y
int y = gid.y - sp.offset;
if (y < sp.vdim[0]) {
out1.write(r, gid.xy, gid.z);
} else {
y -= sp.vdim[0];
if (y < sp.vdim[1]) {
out2.write(r, uint2(gid.x, y), gid.z);
} else {
#if N >= 3
y -= sp.vdim[1];
if (y < sp.vdim[2]) {
out3.write(r, uint2(gid.x, y), gid.z);
} else {
#if N >= 4
y -= sp.vdim[2];
if (y < sp.vdim[3]) {
out4.write(r, uint2(gid.x, y), gid.z);
}
#endif
}
#endif
}
}
#elif V == x
int x = gid.x;
if (x < sp.vdim[0]) {
out1.write(r, gid.xy, gid.z);
} else {
x -= sp.vdim[0];
if (x < sp.vdim[1]) {
out2.write(r, uint2(x, gid.y), gid.z);
} else {
#if N >= 3
x -= sp.vdim[1];
if (x < sp.vdim[2]) {
out3.write(r, uint2(x, gid.y), gid.z);
} else {
#if N >= 4
x -= sp.vdim[2];
if (x < sp.vdim[3]) {
out4.write(r, uint2(x, gid.y), gid.z);
}
#endif
}
#endif
}
}
#else
#endif
}
#endif
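In this template CONCAT5_ builds the entry-point name split_<R>_<N>_<x|y>_<float|half>, the #if N guards add the out3/out4 textures only when more outputs are requested, and V decides whether the split axis maps to the texture y or x coordinate. A hand expansion for R = 3, N = 2, V = y, P = float (the combination instantiated in SplitKernel.metal below), with the branches that are compiled out for N = 2 omitted, for illustration only:
kernel void split_3_2_y_float(texture2d_array<float, access::read> input [[texture(0)]],
                              texture2d_array<float, access::write> out1 [[texture(1)]],
                              texture2d_array<float, access::write> out2 [[texture(2)]],
                              constant SplitParam &sp [[buffer(0)]],
                              uint3 gid [[thread_position_in_grid]]) {
  float4 r = input.read(gid.xy, gid.z);       // VECTOR(P, 4) with P = float
  int y = gid.y - sp.offset;
  if (y < sp.vdim[0]) {
    out1.write(r, gid.xy, gid.z);             // first output keeps the original y
  } else {
    y -= sp.vdim[0];
    if (y < sp.vdim[1]) {
      out2.write(r, uint2(gid.x, y), gid.z);  // later outputs are rebased onto their own y range
    }
  }
}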
......@@ -13,18 +13,60 @@
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
kernel void split(texture2d_array<float, access::write> output[[texture(0)]],
uint3 gid [[thread_position_in_grid]]) {
float4 r;
struct SplitParam {
int32_t idim[4];
int32_t axis;
int32_t offset;
int32_t trans[4];
int32_t vdim[4];
};
// only support split_{2, 3, 4}_{2, 3, 4}_y_{float, half}
// only support split_{3, 4}_{2, 3, 4}_x_{float, half}
#define V y
// for R in 2..4
#define R 3
// for N in 2..4
#define N 2
#define P float
#include "Split.inc.metal"
#undef P
#define P half
#include "Split.inc.metal"
#undef P
#undef N
// end for N
#undef R
// end for R
#undef V
#define V x
// for R in 3..4
#define R 3
// for N in 2..4
#define N 2
#define P float
#include "Split.inc.metal"
#undef P
#define P half
#include "Split.inc.metal"
#undef P
#undef N
// end for N
output.write(r, gid.xy, gid.z);
}
#undef R
// end for R
#undef V
kernel void split_half(texture2d_array<half, access::write> output[[texture(0)]],
uint3 gid [[thread_position_in_grid]]) {
float4 r;
output.write(half4(r), gid.xy, gid.z);
}
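SplitKernel.metal currently instantiates only the (R = 3, N = 2) pair for each axis, but the "for R in 2..4" / "for N in 2..4" comments and the supported-name lists above describe the full grid. A hypothetical sketch of how another combination, e.g. the rank-4, three-output, y-axis split, would be emitted by repeating the same block (illustrative only; not part of this commit):
#define V y
#define R 4
#define N 3
#define P float
#include "Split.inc.metal"   // emits split_4_3_y_float
#undef P
#define P half
#include "Split.inc.metal"   // emits split_4_3_y_half
#undef P
#undef N
#undef R
#undef V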
......@@ -43,15 +43,12 @@ class ReshapeParam<P: PrecisionType>: OpParam {
}
output.padToFourDim = Dim.init(inDim: dim)
output.dim = output.padToFourDim
// inplace = try ReshapeParam.getAttr(key: "inplace", attrs: opDesc.attrs)
} catch let error {
throw error
}
}
let input: Texture<P>
let shape: [Int32]
// let inplace: Bool
var output: Texture<P>
}
......
......@@ -18,17 +18,19 @@ class ShapeParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
output = try ShapeParam.output(outputs: opDesc.outputs, from: inScope)
input = try ShapeParam.input(inputs: opDesc.inputs, from: inScope)
output = try ShapeParam.outputOut(outputs: opDesc.outputs, from: inScope)
} catch let error {
throw error
}
}
var output: Texture<P>
let input: Texture<P>
}
class ShapeOp<P: PrecisionType>: Operator<SplitKernel<P>, SplitParam<P>>, Runable, Creator, InferShaperable{
class ShapeOp<P: PrecisionType>: Operator<ShapeKernel<P>, ShapeParam<P>>, Runable, Creator, InferShaperable{
typealias OpType = SplitOp<P>
typealias OpType = ShapeOp<P>
func inferShape() {
// para.output.dim = para.input.dim
......
......@@ -18,13 +18,32 @@ class SplitParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
// output = try SplitParam.output(outputs: opDesc.outputs, from: inScope)
output = try SplitParam.outputOut(outputs: opDesc.outputs, from: inScope)
input = try SplitParam.inputX(inputs: opDesc.inputs, from: inScope)
output = Texture<P>.init(device: input.metalTexture!.device, inDim: input.dim)
axis = try SplitParam.getAttr(key: "axis", attrs: opDesc.attrs)
sections = try SplitParam.getAttr(key: "sections", attrs: opDesc.attrs)
if axis < 0 {
axis = input.tensorDim.cout() + axis
}
guard let outlist = opDesc.outputs["Out"] else {
fatalError()
}
for out in outlist {
guard let variant = inScope[out], let v = variant as? Texture<P> else {
fatalError()
}
outputList.append(v)
sections.append(Int32(v.tensorDim.dims[axis]))
}
} catch let error {
throw error
}
}
var axis: Int
let input: Texture<P>
var output: Texture<P>
var outputList: [Texture<P>] = []
var sections: [Int32] = []
}
class SplitOp<P: PrecisionType>: Operator<SplitKernel<P>, SplitParam<P>>, Runable, Creator, InferShaperable{
......
......@@ -16,7 +16,7 @@ import Foundation
class ScaleKernel: CusomKernel {
init(device: MTLDevice, shape: Shape) {
super.init(device: device, inFunctionName: "scale_half", outputDim: shape, usePaddleMobileLib: false)
super.init(device: device, inFunctionName: "scale", outputDim: shape, usePaddleMobileLib: false)
}
}
......
......@@ -14,10 +14,10 @@
import Foundation
let testTo = 3
let testTo = 114
var isTest = false
let computePrecision: ComputePrecision = .Float16
let computePrecision: ComputePrecision = .Float32
public class ResultHolder {
public let dim: [Int]
......@@ -101,7 +101,7 @@ public class Executor<P: PrecisionType> {
let inputTexture = InputTexture.init(inMTLTexture: resInput, inExpectDim: Dim.init(inDim: dim))
program.scope.setInput(input: inputTexture)
//(ops.count - except)
for i in 0..<ops.count {
for i in 0..<testTo {
let op = ops[i]
do {
try op.run(device: device, buffer: buffer)
......@@ -112,35 +112,35 @@ public class Executor<P: PrecisionType> {
var outputTextures: [String : [Variant]]?
if except > 0 {
outputTextures = ops[ops.count - except].inputVariant()
outputTextures = ops[testTo-1].inputVariant()
}
buffer.addCompletedHandler { [weak self] (commandbuffer) in
// let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
//// print(inputArr.strideArray())
let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
print(inputArr.strideArray())
// print(dim)
// writeToLibrary(fileName: "test_image_ssd_ar", array: inputArr)
//
// print("write to library done")
// return
// print(inputArr)
// let stridableInput: [(index: Int, value: Float)] = input.stridableFloatArray()
// print(stridableInput)
// let _: Flo? = input.logDesc(header: "input: ", stridable: true)
// for i in 0..<self.ops.count {
// let op = self.ops[i]
// print(" op \(i): ")
// op.delogOutput()
// }
// print(inputArr)
//
// let stridableInput: [(index: Int, value: Float)] = input.stridableFloatArray()
// print(stridableInput)
//
// let _: Flo? = input.logDesc(header: "input: ", stridable: true)
for i in 0..<testTo {
let op = self!.ops[i]
print(" op \(i): ")
op.delogOutput()
}
// return;
// self.ops[testTo - 2].delogOutput()
// self.ops[testTo - 1].delogOutput()
// self.ops[60].delogOutput()
// self!.ops[testTo - 2].delogOutput()
// self!.ops[testTo - 1].delogOutput()
// self!.ops[60].delogOutput()
// return
......