From 92577d1a50518c52a194466e1ac3d2dc6ca7f88f Mon Sep 17 00:00:00 2001 From: liuruilong Date: Sat, 28 Jul 2018 23:32:29 +0800 Subject: [PATCH] metal cun run --- .../xcschemes/paddle-mobile-demo.xcscheme | 2 +- .../paddle-mobile-demo/PreProcessKernel.metal | 16 +- .../paddle-mobile-demo/ViewController.swift | 49 ++-- .../paddle-mobile/Common/MetalExtension.swift | 4 +- .../paddle-mobile/Common/Tools.swift | 2 +- .../paddle-mobile/Executor.swift | 24 +- .../paddle-mobile/paddle-mobile/Loader.swift | 3 +- .../Operators/ConvAddBatchNormReluOp.swift | 14 +- .../Kernels/ConvAddBatchNormReluKernel.swift | 14 +- .../Operators/Kernels/ConvAddKernel.swift | 3 + .../Operators/Kernels/ConvKernel.metal | 258 ++++++++++++++---- .../Operators/Kernels/ConvKernel.swift | 1 + .../Operators/Kernels/Kernels.metal | 78 ++++++ .../Kernels/Texture2DTo2DArrayKernel.swift | 2 - .../paddle-mobile/framework/Tensor.swift | 76 +++++- .../paddle-mobile/framework/Texture.swift | 6 +- 16 files changed, 434 insertions(+), 118 deletions(-) diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/paddle-mobile-demo.xcscheme b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/paddle-mobile-demo.xcscheme index 46c65bd36a..de579675e0 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/paddle-mobile-demo.xcscheme +++ b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/paddle-mobile-demo.xcscheme @@ -42,7 +42,7 @@ inTexture [[texture(0)]], + texture2d outTexture [[texture(1)]], + uint2 gid [[thread_position_in_grid]]) +{ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f); + const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); +} + diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift index 5e96655e76..1d8d50e1c9 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift @@ -26,9 +26,12 @@ class PreProccess: CusomKernel { } } - class ViewController: UIViewController { var textureLoader: MTKTextureLoader! + var program: Program! + var executor: Executor! + var preprocessKernel: PreProccess! + // let queue: MTLCommandQueue func scaleTexture(queue: MTLCommandQueue, input: MTLTexture, complete: @escaping (MTLTexture) -> Void) { let tmpTextureDes = MTLTextureDescriptor.init() @@ -57,18 +60,9 @@ class ViewController: UIViewController { unitTest.testConvAddBnRelu() } - override func viewDidLoad() { - super.viewDidLoad() - - if openTest { - print(" - testing - ") - unitTest() - return - } - - - -// return + override func touchesBegan(_ touches: Set, with event: UIEvent?) { + super.touchesBegan(touches, with: event) + // return let queue = MetalHelper.shared.queue textureLoader = MTKTextureLoader.init(device: MetalHelper.shared.device) @@ -81,22 +75,33 @@ class ViewController: UIViewController { guard let inTexture = texture else { fatalError(" texture is nil !") } - + scaleTexture(queue: queue, input: inTexture) { (inputTexture) in - let loader = Loader.init() do { - let modelPath = Bundle.main.path(forResource: "model", ofType: nil) ?! "model null" - let paraPath = Bundle.main.path(forResource: "params", ofType: nil) ?! "para null" - let program = try loader.load(device: MetalHelper.shared.device, modelPath: modelPath, paraPath: paraPath) - let executor = try Executor.init(inDevice: MetalHelper.shared.device, inQueue: queue, inProgram: program) - let preprocessKernel = PreProccess.init(device: MetalHelper.shared.device) - try executor.predict(input: inputTexture, expect: [1, 224, 224, 3], completionHandle: { (result) in + try self.executor.predict(input: inputTexture, expect: [1, 224, 224, 3], completionHandle: { (result) in print(result.resultArr.top(r: 5)) - }, preProcessKernle: preprocessKernel) + }, preProcessKernle: self.preprocessKernel) } catch let error { print(error) } } } + + override func viewDidLoad() { + super.viewDidLoad() + + let queue = MetalHelper.shared.queue + let loader = Loader.init() + preprocessKernel = PreProccess.init(device: MetalHelper.shared.device) + + do { + let modelPath = Bundle.main.path(forResource: "model", ofType: nil) ?! "model null" + let paraPath = Bundle.main.path(forResource: "params", ofType: nil) ?! "para null" + program = try loader.load(device: MetalHelper.shared.device, modelPath: modelPath, paraPath: paraPath) + executor = try Executor.init(inDevice: MetalHelper.shared.device, inQueue: queue, inProgram: program) + } catch let error { + print(error) + } + } } diff --git a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift index af4c01d5fe..b750018260 100644 --- a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift +++ b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift @@ -120,8 +120,8 @@ extension MTLComputeCommandEncoder { let groupDepth = slices let groups = MTLSize.init(width: groupWidth, height: groupHeight, depth: groupDepth) - print("groups: \(groups) ") - print("threads per group: \(threadsPerGroup)") +// print("groups: \(groups) ") +// print("threads per group: \(threadsPerGroup)") setComputePipelineState(computePipline) diff --git a/metal/paddle-mobile/paddle-mobile/Common/Tools.swift b/metal/paddle-mobile/paddle-mobile/Common/Tools.swift index cc1f7a4f21..930198fbf9 100644 --- a/metal/paddle-mobile/paddle-mobile/Common/Tools.swift +++ b/metal/paddle-mobile/paddle-mobile/Common/Tools.swift @@ -8,7 +8,6 @@ import Foundation - func writeToLibrary(fileName: String, array: [P]) { let libraryPath = NSSearchPathForDirectoriesInDomains(.libraryDirectory, .userDomainMask, true).last ?! " library path get error " let filePath = libraryPath + "/" + fileName @@ -19,3 +18,4 @@ func writeToLibrary(fileName: String, array: [P]) { fileHandler.write(data) fileHandler.closeFile() } + diff --git a/metal/paddle-mobile/paddle-mobile/Executor.swift b/metal/paddle-mobile/paddle-mobile/Executor.swift index e883754b2c..40a80d2ddf 100644 --- a/metal/paddle-mobile/paddle-mobile/Executor.swift +++ b/metal/paddle-mobile/paddle-mobile/Executor.swift @@ -57,7 +57,7 @@ public class Executor { queue = inQueue for block in inProgram.programDesc.blocks { //block.ops.count - for i in 0..<2 { + for i in 0...shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope) @@ -109,20 +109,26 @@ public class Executor { } buffer.addCompletedHandler { (commandbuffer) in - let inputArr = resInput.floatArray(res: { (p:P) -> P in - return p - }) +// let inputArr = resInput.floatArray(res: { (p:P) -> P in +// return p +// }) // print(inputArr) // let stridableInput: [(index: Int, value: Float)] = input.stridableFloatArray() // print(stridableInput) // let _: Flo? = input.logDesc(header: "input: ", stridable: true) - for op in self.ops { - op.delogOutput() - } - return +// for op in self.ops { +// op.delogOutput() +// } +// return + +// self.ops[2].delogOutput() + + let afterDate = Date.init() + print(" encoder end ! time: \(afterDate.timeIntervalSince(beforeDate))") + guard let outputVar = self.program.scope.output() else { fatalError("output nil") } @@ -134,8 +140,6 @@ public class Executor { return p })) completionHandle(resultHodlder) - let afterDate = Date.init() - print(" encoder end ! time: \(afterDate.timeIntervalSince(beforeDate))") } buffer.commit() } diff --git a/metal/paddle-mobile/paddle-mobile/Loader.swift b/metal/paddle-mobile/paddle-mobile/Loader.swift index 31fd21ebd4..472c588430 100644 --- a/metal/paddle-mobile/paddle-mobile/Loader.swift +++ b/metal/paddle-mobile/paddle-mobile/Loader.swift @@ -15,7 +15,6 @@ import Foundation import SwiftProtobuf - public class Loader { class ParaLoader { let file: UnsafeMutablePointer @@ -163,7 +162,7 @@ public class Loader { throw error } tensor.convert(to: .NHWC) - tensor.initBuffer(device: device) +// tensor.initBuffer(device: device) scope[varDesc.name] = tensor } else { let dim = Dim.init(inDim: tensorDesc.NHWCDim) diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift index 8746ba980d..f24e25b054 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift @@ -116,9 +116,17 @@ class ConvAddBatchNormReluOp: Operator P in + return p + } +// + writeToLibrary(fileName: "output_112x112x32_2", array: output) + print(" write done") + // let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift index e8ee935390..0ffe90272f 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift @@ -58,6 +58,14 @@ class ConvAddBatchNormReluKernel: Kernel, Computable, Testable super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3") } + param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) + param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) + + param.variance.initBuffer(device: device) + param.mean.initBuffer(device: device) + param.scale.initBuffer(device: device) + param.bias.initBuffer(device: device) + let offsetX = param.filter.width/2 - Int(param.paddings[0]) let offsetY = param.filter.height/2 - Int(param.paddings[1]) @@ -70,7 +78,7 @@ class ConvAddBatchNormReluKernel: Kernel, Computable, Testable var invs: [P] = [] let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) - for i in 0...stride { + for i in 0...stride { let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5) invs.append(P(inv)) } @@ -78,7 +86,7 @@ class ConvAddBatchNormReluKernel: Kernel, Computable, Testable let newScale: UnsafeMutablePointer

= UnsafeMutablePointer

.allocate(capacity: param.scale.buffer.length) let newBiase: UnsafeMutablePointer

= UnsafeMutablePointer

.allocate(capacity: param.bias.buffer.length) - let scaleContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) + let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self) let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self) let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self) for i in 0...stride { @@ -100,7 +108,6 @@ class ConvAddBatchNormReluKernel: Kernel, Computable, Testable throw PaddleMobileError.predictError(message: " encode is nil") } - print("ConvAddBatchNormReluKernel compute") encoder.setTexture(param.input.metalTexture, index: 0) encoder.setTexture(param.output.metalTexture, index: 1) encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) @@ -117,7 +124,6 @@ class ConvAddBatchNormReluKernel: Kernel, Computable, Testable fatalError() } - print("ConvAddBatchNormReluKernel compute") encoder.setTexture(param.inputTexture, index: 0) encoder.setTexture(param.outputTexture, index: 1) var inMetalParam = param.metalParam diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift index 81bfda997c..1f24a1642e 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift @@ -21,6 +21,9 @@ class ConvAddKernel: Kernel, Computable { let offsetX = param.filter.width/2 - Int(param.paddings[0]) let offsetY = param.filter.height/2 - Int(param.paddings[1]) + param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) + param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) + print("offset x: \(offsetX)") print("offset y: \(offsetY)") diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal index a738d55e39..9d0c6de35e 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal @@ -24,53 +24,58 @@ struct MetalConvParam { }; -//kernel void conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], -// texture2d_array outTexture [[texture(1)]], -// constant MetalConvParam ¶m [[buffer(0)]], -// const device half4 *weights [[buffer(1)]], -// const device half4 *biase [[buffer(2)]], -// const device half4 *new_scale [[buffer(3)]], -// const device half4 *new_biase [[buffer(4)]], -// uint3 gid [[thread_position_in_grid]]) { -// -// if (gid.x >= outTexture.get_width() || -// gid.y >= outTexture.get_height() || -// gid.z >= outTexture.get_array_size()) { -// return; -// } -// -// short2 posInInput = short2(gid.xy) + short2(param.offsetX, param.offsetY); -// constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); -// const uint wightSliceCount = 36; -// uint weithTo = gid.z * wightSliceCount * inTexture.get_array_size(); -// half4 output = 0.0; -// for (uint i = 0; i < inTexture.get_array_size(); ++i) { -// half4 input[9]; -// input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); -// input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); -// input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); -// input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); -// input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); -// input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); -// input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); -// input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); -// input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); -// for (int j = 0; j < 9; ++j) { -// half4 weight = weights[weithTo + wightSliceCount * i + j * 4]; -// output += dot(input[j], weight); -// } -// } -// -// output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0h); -// outTexture.write(output, gid.xy, gid.z); -// -//} +kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + half4 output = half4(0.0); + + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + + output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); + outTexture.write(output, gid.xy, gid.z); +} -kernel void conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], +kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], const device float4 *new_scale [[buffer(3)]], const device float4 *new_biase [[buffer(4)]], uint3 gid [[thread_position_in_grid]]) { @@ -89,9 +94,9 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array uint input_arr_size = inTexture.get_array_size(); uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 output = float4(0.0); + half4 output = half4(0.0); - float4 input[9]; + half4 input[9]; for (uint i = 0; i < input_arr_size; ++i) { input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); @@ -103,23 +108,113 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); for (int j = 0; j < 9; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; output.x += dot(input[j], weight_x); - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; output.y += dot(input[j], weight_y); - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; output.z += dot(input[j], weight_z); - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; output.w += dot(input[j], weight_w); } } - output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); + output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_add_1x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + half4 output = half4(0.0); + + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + output = output + biase[gid.z]; outTexture.write(output, gid.xy, gid.z); } +kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + half4 output = half4(0.0); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); + outTexture.write(output, gid.xy, gid.z); +} + + +/*---------------------------------------------*/ + + + kernel void conv_add_batch_norm_relu_1x1(texture2d_array inTexture [[texture(0)]], texture2d_array outTexture [[texture(1)]], constant MetalConvParam ¶m [[buffer(0)]], @@ -165,6 +260,60 @@ kernel void conv_add_batch_norm_relu_1x1(texture2d_array outTexture.write(output, gid.xy, gid.z); } +kernel void conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + kernel void conv_add_1x1(texture2d_array inTexture [[texture(0)]], texture2d_array outTexture [[texture(1)]], constant MetalConvParam ¶m [[buffer(0)]], @@ -208,7 +357,6 @@ kernel void conv_add_1x1(texture2d_array inTexture [[text outTexture.write(output, gid.xy, gid.z); } - kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], texture2d_array outTexture [[texture(1)]], constant MetalConvParam ¶m [[buffer(0)]], @@ -224,7 +372,6 @@ kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array: Kernel, Computable { let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0]) let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1]) let offsetZ = 0.0 + param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3])) } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernels.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernels.metal index 5b797178f5..92ee118452 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernels.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernels.metal @@ -96,6 +96,17 @@ kernel void texture2d_to_2d_array(texture2d inTexture [[tex } +kernel void texture2d_to_2d_array_half(texture2d inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height()){ + return; + } + const half4 input = inTexture.read(gid.xy); + outTexture.write(input, gid.xy, 0); +} + struct PoolParam { int ksizeX; int ksizeY; @@ -140,6 +151,39 @@ kernel void pool(texture2d_array inTexture [[texture(0)]], } +kernel void pool_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant PoolParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + int xmin = gid.x * pm.strideX - pm.paddingX; + int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width())); + xmin = max(xmin, 0); + int ymin = gid.y * pm.strideX - pm.paddingX; + int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height())); + ymin = max(ymin, 0); + + half4 r = 0; + if (pm.poolType == 0) { + r = inTexture.read(uint2(xmin, ymin), gid.z); + for (int x = xmin; x < xmax; x++) { + for (int y = ymin; y < ymax; y++) { + r = fmax(r, inTexture.read(uint2(x, y), gid.z)); + } + } + } else if (pm.poolType == 1) { + for (int x = xmin; x < xmax; x++) { + for (int y = ymin; y < ymax; y++) { + r += inTexture.read(uint2(x, y), gid.z); + } + } + r /= pm.ksizeX * pm.ksizeY; + } + outTexture.write(r, gid.xy, gid.z); +} + kernel void reshape(texture2d_array inTexture [[texture(0)]], texture2d_array outTexture [[texture(1)]], uint3 gid [[thread_position_in_grid]]) { @@ -151,6 +195,17 @@ kernel void reshape(texture2d_array inTexture [[texture(0)] outTexture.write(r, gid.xy, gid.z); } +kernel void reshape_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + + half4 r = inTexture.read(uint2(0, 0), gid.z); + outTexture.write(r, gid.xy, gid.z); +} + kernel void softmax(texture2d_array inTexture [[texture(0)]], texture2d_array outTexture [[texture(1)]], uint3 gid [[thread_position_in_grid]]) { @@ -172,3 +227,26 @@ kernel void softmax(texture2d_array inTexture [[texture(0)] rr = exp(rr - maxv) / sum; outTexture.write(rr, gid.xy, gid.z); } + + +kernel void softmax_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + int zsize = inTexture.get_array_size(); + half maxv = inTexture.read(uint2(0, 0), 0)[0]; + for (int z = 0; z < zsize; z++) { + half4 r = inTexture.read(uint2(0, 0), z); + maxv = max(maxv, max(max(r[0], r[1]), max(r[2], r[3]))); + } + float sum = 0; + for (int z = 0; z < zsize; z++) { + half4 r = inTexture.read(uint2(0, 0), z); + sum += exp(r[0] - maxv) + exp(r[1] - maxv) + exp(r[2] - maxv) + exp(r[3] - maxv); + } + half4 rr = inTexture.read(gid.xy, gid.z); + rr = exp(rr - maxv) / sum; + outTexture.write(rr, gid.xy, gid.z); +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Texture2DTo2DArrayKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Texture2DTo2DArrayKernel.swift index 3bf912c873..b524c3ac80 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Texture2DTo2DArrayKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Texture2DTo2DArrayKernel.swift @@ -20,7 +20,6 @@ struct Texture2DTo2DArrayParam { let expectDim: Dim } - class Texture2DTo2DArrayKernel: Kernel, Computable{ func compute(commandBuffer: MTLCommandBuffer, param: FeedParam

) throws { guard let encoder = commandBuffer.makeComputeCommandEncoder() else { @@ -36,4 +35,3 @@ class Texture2DTo2DArrayKernel: Kernel, Computable{ super.init(device: device, inFunctionName: "texture2d_to_2d_array") } } - diff --git a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift index 39891c77de..beed19b653 100644 --- a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift +++ b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift @@ -12,6 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ +import Accelerate import Foundation protocol Tensorial: CustomStringConvertible, CustomDebugStringConvertible{ @@ -27,6 +28,10 @@ extension Tensorial { } class Tensor: Tensorial { + enum BufferPrecision { + case Float32, Float16 + } + var data: Data var dim: Dim var buffer: MTLBuffer! @@ -88,7 +93,28 @@ class Tensor: Tensorial { layout = to } - func initBuffer(device: MTLDevice) { + func float32ToFloat16(input: UnsafeMutablePointer, output: UnsafeMutableRawPointer, count: Int) { + var float32Buffer = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4) + var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2) + guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else { + fatalError(" float 32 to float 16 error ! ") + } + } + + func initBuffer(device: MTLDevice, precision: BufferPrecision = .Float32) { + guard let floatPointer = data.pointer as? UnsafeMutablePointer else { + fatalError(" not support yet ") + } + + + let precisionSize: Int + switch precision { + case .Float32: + precisionSize = 4 + case .Float16: + precisionSize = 2 + } + if dim.cout() == 4 { if layout == .NHWC { let C = dim[3] @@ -96,29 +122,55 @@ class Tensor: Tensorial { let paddedC = cSlices * 4 let count = paddedC * dim[0] * dim[1] * dim[2] if C == paddedC { - buffer = device.makeBuffer(length: count * MemoryLayout

.stride) - buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout

.stride) + buffer = device.makeBuffer(length: count * precisionSize) + switch precision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout

.stride) + case .Float16: + float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count) + } } else if C == 1 { - buffer = device.makeBuffer(length: numel() * MemoryLayout

.stride) - buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout

.stride) + buffer = device.makeBuffer(length: numel() * precisionSize) + switch precision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout

.stride) + case .Float16: + float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel()) + } } else { - buffer = device.makeBuffer(length: count * MemoryLayout

.stride) - var tmpPointer = data.pointer - var dstPtr = buffer?.contents().bindMemory(to: P.self, capacity: count) + buffer = device.makeBuffer(length: count * precisionSize) + let convertedPointer = UnsafeMutablePointer.allocate(capacity: count) + var tmpPointer = floatPointer + var dstPtr = convertedPointer for _ in 0...stride) + case .Float16: + float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count) } + + convertedPointer.deinitialize(count: count) + convertedPointer.deallocate() } } } else if dim.cout() == 1 { - buffer = device.makeBuffer(length: numel() * MemoryLayout

.stride) - buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout

.stride) + buffer = device.makeBuffer(length: numel() * precisionSize) + switch precision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout

.stride) + case .Float16: + float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel()) + } } else { fatalError(" not support !") } diff --git a/metal/paddle-mobile/paddle-mobile/framework/Texture.swift b/metal/paddle-mobile/paddle-mobile/framework/Texture.swift index 50f9f7d067..81894664c5 100644 --- a/metal/paddle-mobile/paddle-mobile/framework/Texture.swift +++ b/metal/paddle-mobile/paddle-mobile/framework/Texture.swift @@ -68,16 +68,18 @@ public class Texture: Tensorial { } else { fatalError(" not suuprt ") } + if MemoryLayout

.size == 1 { tmpTextureDes.pixelFormat = .rgba8Unorm } else if MemoryLayout

.size == 2 { - tmpTextureDes.pixelFormat = .rgba32Float + tmpTextureDes.pixelFormat = .rgba16Float } else if MemoryLayout

.size == 4 { // tmpTextureDes.pixelFormat = .r32Float tmpTextureDes.pixelFormat = .rgba32Float } - +// tmpTextureDes.pixelFormat = .rgba16Float + tmpTextureDes.usage = [.shaderRead, .shaderWrite] tmpTextureDes.storageMode = .shared textureDesc = tmpTextureDes -- GitLab