提交 92577d1a 编写于 作者: L liuruilong

metal cun run

上级 c3c7b07d
......@@ -42,7 +42,7 @@
</AdditionalOptions>
</TestAction>
<LaunchAction
buildConfiguration = "Debug"
buildConfiguration = "Release"
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
launchStyle = "0"
......
......@@ -20,10 +20,24 @@ kernel void preprocess(
return;
}
const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f);
const float4 inColor = (float4(float4(inTexture.read(gid))) * 255.0f - means) * 0.017f;
const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
}
kernel void preprocess_half(
texture2d<half, access::read> inTexture [[texture(0)]],
texture2d<half, access::write> outTexture [[texture(1)]],
uint2 gid [[thread_position_in_grid]])
{
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height()) {
return;
}
const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f);
const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
}
......
......@@ -26,9 +26,12 @@ class PreProccess: CusomKernel {
}
}
class ViewController: UIViewController {
var textureLoader: MTKTextureLoader!
var program: Program!
var executor: Executor<Float32>!
var preprocessKernel: PreProccess!
// let queue: MTLCommandQueue
func scaleTexture(queue: MTLCommandQueue, input: MTLTexture, complete: @escaping (MTLTexture) -> Void) {
let tmpTextureDes = MTLTextureDescriptor.init()
......@@ -57,18 +60,9 @@ class ViewController: UIViewController {
unitTest.testConvAddBnRelu()
}
override func viewDidLoad() {
super.viewDidLoad()
if openTest {
print(" - testing - ")
unitTest()
return
}
// return
override func touchesBegan(_ touches: Set<UITouch>, with event: UIEvent?) {
super.touchesBegan(touches, with: event)
// return
let queue = MetalHelper.shared.queue
textureLoader = MTKTextureLoader.init(device: MetalHelper.shared.device)
......@@ -81,22 +75,33 @@ class ViewController: UIViewController {
guard let inTexture = texture else {
fatalError(" texture is nil !")
}
scaleTexture(queue: queue, input: inTexture) { (inputTexture) in
let loader = Loader<Float32>.init()
do {
let modelPath = Bundle.main.path(forResource: "model", ofType: nil) ?! "model null"
let paraPath = Bundle.main.path(forResource: "params", ofType: nil) ?! "para null"
let program = try loader.load(device: MetalHelper.shared.device, modelPath: modelPath, paraPath: paraPath)
let executor = try Executor<Float32>.init(inDevice: MetalHelper.shared.device, inQueue: queue, inProgram: program)
let preprocessKernel = PreProccess.init(device: MetalHelper.shared.device)
try executor.predict(input: inputTexture, expect: [1, 224, 224, 3], completionHandle: { (result) in
try self.executor.predict(input: inputTexture, expect: [1, 224, 224, 3], completionHandle: { (result) in
print(result.resultArr.top(r: 5))
}, preProcessKernle: preprocessKernel)
}, preProcessKernle: self.preprocessKernel)
} catch let error {
print(error)
}
}
}
override func viewDidLoad() {
super.viewDidLoad()
let queue = MetalHelper.shared.queue
let loader = Loader<Float32>.init()
preprocessKernel = PreProccess.init(device: MetalHelper.shared.device)
do {
let modelPath = Bundle.main.path(forResource: "model", ofType: nil) ?! "model null"
let paraPath = Bundle.main.path(forResource: "params", ofType: nil) ?! "para null"
program = try loader.load(device: MetalHelper.shared.device, modelPath: modelPath, paraPath: paraPath)
executor = try Executor<Float32>.init(inDevice: MetalHelper.shared.device, inQueue: queue, inProgram: program)
} catch let error {
print(error)
}
}
}
......@@ -120,8 +120,8 @@ extension MTLComputeCommandEncoder {
let groupDepth = slices
let groups = MTLSize.init(width: groupWidth, height: groupHeight, depth: groupDepth)
print("groups: \(groups) ")
print("threads per group: \(threadsPerGroup)")
// print("groups: \(groups) ")
// print("threads per group: \(threadsPerGroup)")
setComputePipelineState(computePipline)
......
......@@ -8,7 +8,6 @@
import Foundation
func writeToLibrary<P: PrecisionType>(fileName: String, array: [P]) {
let libraryPath = NSSearchPathForDirectoriesInDomains(.libraryDirectory, .userDomainMask, true).last ?! " library path get error "
let filePath = libraryPath + "/" + fileName
......@@ -19,3 +18,4 @@ func writeToLibrary<P: PrecisionType>(fileName: String, array: [P]) {
fileHandler.write(data)
fileHandler.closeFile()
}
......@@ -57,7 +57,7 @@ public class Executor<P: PrecisionType> {
queue = inQueue
for block in inProgram.programDesc.blocks {
//block.ops.count
for i in 0..<2 {
for i in 0..<block.ops.count {
let op = block.ops[i]
do {
let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
......@@ -109,20 +109,26 @@ public class Executor<P: PrecisionType> {
}
buffer.addCompletedHandler { (commandbuffer) in
let inputArr = resInput.floatArray(res: { (p:P) -> P in
return p
})
// let inputArr = resInput.floatArray(res: { (p:P) -> P in
// return p
// })
// print(inputArr)
// let stridableInput: [(index: Int, value: Float)] = input.stridableFloatArray()
// print(stridableInput)
// let _: Flo? = input.logDesc(header: "input: ", stridable: true)
for op in self.ops {
op.delogOutput()
}
return
// for op in self.ops {
// op.delogOutput()
// }
// return
// self.ops[2].delogOutput()
let afterDate = Date.init()
print(" encoder end ! time: \(afterDate.timeIntervalSince(beforeDate))")
guard let outputVar = self.program.scope.output() else {
fatalError("output nil")
}
......@@ -134,8 +140,6 @@ public class Executor<P: PrecisionType> {
return p
}))
completionHandle(resultHodlder)
let afterDate = Date.init()
print(" encoder end ! time: \(afterDate.timeIntervalSince(beforeDate))")
}
buffer.commit()
}
......
......@@ -15,7 +15,6 @@
import Foundation
import SwiftProtobuf
public class Loader<P: PrecisionType> {
class ParaLoader {
let file: UnsafeMutablePointer<FILE>
......@@ -163,7 +162,7 @@ public class Loader<P: PrecisionType> {
throw error
}
tensor.convert(to: .NHWC)
tensor.initBuffer(device: device)
// tensor.initBuffer(device: device)
scope[varDesc.name] = tensor
} else {
let dim = Dim.init(inDim: tensorDesc.NHWCDim)
......
......@@ -116,9 +116,17 @@ class ConvAddBatchNormReluOp<P: PrecisionType>: Operator<ConvAddBatchNormReluKer
// print("padding: \(para.paddings)")
// print("stride: \(para.stride)")
let _: P? = para.y.buffer?.logDesc(header: " biase: ", stridable: false)
let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false)
let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false)
// let _: P? = para.y.buffer?.logDesc(header: " biase: ", stridable: false)
// let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false)
// let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false)
let output = para.output.metalTexture.floatArray { (p: P) -> P in
return p
}
//
writeToLibrary(fileName: "output_112x112x32_2", array: output)
print(" write done")
// let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false)
}
}
......@@ -58,6 +58,14 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
}
param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
param.variance.initBuffer(device: device)
param.mean.initBuffer(device: device)
param.scale.initBuffer(device: device)
param.bias.initBuffer(device: device)
let offsetX = param.filter.width/2 - Int(param.paddings[0])
let offsetY = param.filter.height/2 - Int(param.paddings[1])
......@@ -70,7 +78,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
var invs: [P] = []
let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
for i in 0..<param.variance.buffer.length/MemoryLayout<P>.stride {
for i in 0..<param.variance.buffer.length/MemoryLayout<P>.stride {
let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
invs.append(P(inv))
}
......@@ -78,7 +86,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
let newScale: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.scale.buffer.length)
let newBiase: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.bias.buffer.length)
let scaleContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
for i in 0..<param.scale.buffer.length/MemoryLayout<P>.stride {
......@@ -100,7 +108,6 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
throw PaddleMobileError.predictError(message: " encode is nil")
}
print("ConvAddBatchNormReluKernel compute")
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
......@@ -117,7 +124,6 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
fatalError()
}
print("ConvAddBatchNormReluKernel compute")
encoder.setTexture(param.inputTexture, index: 0)
encoder.setTexture(param.outputTexture, index: 1)
var inMetalParam = param.metalParam
......
......@@ -21,6 +21,9 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
let offsetX = param.filter.width/2 - Int(param.paddings[0])
let offsetY = param.filter.height/2 - Int(param.paddings[1])
param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
print("offset x: \(offsetX)")
print("offset y: \(offsetY)")
......
......@@ -24,53 +24,58 @@ struct MetalConvParam {
};
//kernel void conv_add_batch_norm_relu_3x3(texture2d_array<half, access::sample> inTexture [[texture(0)]],
// texture2d_array<half, access::write> outTexture [[texture(1)]],
// constant MetalConvParam &param [[buffer(0)]],
// const device half4 *weights [[buffer(1)]],
// const device half4 *biase [[buffer(2)]],
// const device half4 *new_scale [[buffer(3)]],
// const device half4 *new_biase [[buffer(4)]],
// uint3 gid [[thread_position_in_grid]]) {
//
// if (gid.x >= outTexture.get_width() ||
// gid.y >= outTexture.get_height() ||
// gid.z >= outTexture.get_array_size()) {
// return;
// }
//
// short2 posInInput = short2(gid.xy) + short2(param.offsetX, param.offsetY);
// constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
// const uint wightSliceCount = 36;
// uint weithTo = gid.z * wightSliceCount * inTexture.get_array_size();
// half4 output = 0.0;
// for (uint i = 0; i < inTexture.get_array_size(); ++i) {
// half4 input[9];
// input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
// input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
// input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
// input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
// input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
// input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
// input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
// input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
// input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
// for (int j = 0; j < 9; ++j) {
// half4 weight = weights[weithTo + wightSliceCount * i + j * 4];
// output += dot(input[j], weight);
// }
// }
//
// output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0h);
// outTexture.write(output, gid.xy, gid.z);
//
//}
kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
half4 output = half4(0.0);
half4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
......@@ -89,9 +94,9 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample>
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
half4 output = half4(0.0);
float4 input[9];
half4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
......@@ -103,23 +108,113 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample>
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
half4 output = half4(0.0);
half4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
output = output + biase[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
half4 output = half4(0.0);
half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
half4 input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
outTexture.write(output, gid.xy, gid.z);
}
/*---------------------------------------------*/
kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
......@@ -165,6 +260,60 @@ kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample>
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
......@@ -208,7 +357,6 @@ kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[text
outTexture.write(output, gid.xy, gid.z);
}
kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
......@@ -224,7 +372,6 @@ kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
......@@ -248,7 +395,6 @@ kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
output = (output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z];
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z);
}
......@@ -31,6 +31,7 @@ class ConvKernel<P: PrecisionType>: Kernel, Computable {
let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0])
let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1])
let offsetZ = 0.0
param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]))
}
......
......@@ -96,6 +96,17 @@ kernel void texture2d_to_2d_array(texture2d<float, access::read> inTexture [[tex
}
kernel void texture2d_to_2d_array_half(texture2d<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= inTexture.get_width() ||
gid.y >= inTexture.get_height()){
return;
}
const half4 input = inTexture.read(gid.xy);
outTexture.write(input, gid.xy, 0);
}
struct PoolParam {
int ksizeX;
int ksizeY;
......@@ -140,6 +151,39 @@ kernel void pool(texture2d_array<float, access::read> inTexture [[texture(0)]],
}
kernel void pool_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant PoolParam &pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
int xmin = gid.x * pm.strideX - pm.paddingX;
int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
xmin = max(xmin, 0);
int ymin = gid.y * pm.strideX - pm.paddingX;
int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height()));
ymin = max(ymin, 0);
half4 r = 0;
if (pm.poolType == 0) {
r = inTexture.read(uint2(xmin, ymin), gid.z);
for (int x = xmin; x < xmax; x++) {
for (int y = ymin; y < ymax; y++) {
r = fmax(r, inTexture.read(uint2(x, y), gid.z));
}
}
} else if (pm.poolType == 1) {
for (int x = xmin; x < xmax; x++) {
for (int y = ymin; y < ymax; y++) {
r += inTexture.read(uint2(x, y), gid.z);
}
}
r /= pm.ksizeX * pm.ksizeY;
}
outTexture.write(r, gid.xy, gid.z);
}
kernel void reshape(texture2d_array<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
......@@ -151,6 +195,17 @@ kernel void reshape(texture2d_array<float, access::read> inTexture [[texture(0)]
outTexture.write(r, gid.xy, gid.z);
}
kernel void reshape_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
half4 r = inTexture.read(uint2(0, 0), gid.z);
outTexture.write(r, gid.xy, gid.z);
}
kernel void softmax(texture2d_array<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
......@@ -172,3 +227,26 @@ kernel void softmax(texture2d_array<float, access::read> inTexture [[texture(0)]
rr = exp(rr - maxv) / sum;
outTexture.write(rr, gid.xy, gid.z);
}
kernel void softmax_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
int zsize = inTexture.get_array_size();
half maxv = inTexture.read(uint2(0, 0), 0)[0];
for (int z = 0; z < zsize; z++) {
half4 r = inTexture.read(uint2(0, 0), z);
maxv = max(maxv, max(max(r[0], r[1]), max(r[2], r[3])));
}
float sum = 0;
for (int z = 0; z < zsize; z++) {
half4 r = inTexture.read(uint2(0, 0), z);
sum += exp(r[0] - maxv) + exp(r[1] - maxv) + exp(r[2] - maxv) + exp(r[3] - maxv);
}
half4 rr = inTexture.read(gid.xy, gid.z);
rr = exp(rr - maxv) / sum;
outTexture.write(rr, gid.xy, gid.z);
}
......@@ -20,7 +20,6 @@ struct Texture2DTo2DArrayParam {
let expectDim: Dim
}
class Texture2DTo2DArrayKernel<P: PrecisionType>: Kernel, Computable{
func compute(commandBuffer: MTLCommandBuffer, param: FeedParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
......@@ -36,4 +35,3 @@ class Texture2DTo2DArrayKernel<P: PrecisionType>: Kernel, Computable{
super.init(device: device, inFunctionName: "texture2d_to_2d_array")
}
}
......@@ -12,6 +12,7 @@
See the License for the specific language governing permissions and
limitations under the License. */
import Accelerate
import Foundation
protocol Tensorial: CustomStringConvertible, CustomDebugStringConvertible{
......@@ -27,6 +28,10 @@ extension Tensorial {
}
class Tensor<P: PrecisionType>: Tensorial {
enum BufferPrecision {
case Float32, Float16
}
var data: Data
var dim: Dim
var buffer: MTLBuffer!
......@@ -88,7 +93,28 @@ class Tensor<P: PrecisionType>: Tensorial {
layout = to
}
func initBuffer(device: MTLDevice) {
func float32ToFloat16(input: UnsafeMutablePointer<Float32>, output: UnsafeMutableRawPointer, count: Int) {
var float32Buffer = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4)
var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2)
guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else {
fatalError(" float 32 to float 16 error ! ")
}
}
func initBuffer(device: MTLDevice, precision: BufferPrecision = .Float32) {
guard let floatPointer = data.pointer as? UnsafeMutablePointer<Float32> else {
fatalError(" not support yet ")
}
let precisionSize: Int
switch precision {
case .Float32:
precisionSize = 4
case .Float16:
precisionSize = 2
}
if dim.cout() == 4 {
if layout == .NHWC {
let C = dim[3]
......@@ -96,29 +122,55 @@ class Tensor<P: PrecisionType>: Tensorial {
let paddedC = cSlices * 4
let count = paddedC * dim[0] * dim[1] * dim[2]
if C == paddedC {
buffer = device.makeBuffer(length: count * MemoryLayout<P>.stride)
buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
buffer = device.makeBuffer(length: count * precisionSize)
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
}
} else if C == 1 {
buffer = device.makeBuffer(length: numel() * MemoryLayout<P>.stride)
buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
buffer = device.makeBuffer(length: numel() * precisionSize)
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
}
} else {
buffer = device.makeBuffer(length: count * MemoryLayout<P>.stride)
var tmpPointer = data.pointer
var dstPtr = buffer?.contents().bindMemory(to: P.self, capacity: count)
buffer = device.makeBuffer(length: count * precisionSize)
let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
var tmpPointer = floatPointer
var dstPtr = convertedPointer
for _ in 0..<dim[0] * dim[1] * dim[2] {
for j in 0..<paddedC {
if j < C {
dstPtr?[j] = tmpPointer[j]
dstPtr[j] = tmpPointer[j]
}
}
tmpPointer += C
dstPtr! += paddedC
dstPtr += paddedC
}
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
}
convertedPointer.deinitialize(count: count)
convertedPointer.deallocate()
}
}
} else if dim.cout() == 1 {
buffer = device.makeBuffer(length: numel() * MemoryLayout<P>.stride)
buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
buffer = device.makeBuffer(length: numel() * precisionSize)
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
}
} else {
fatalError(" not support !")
}
......
......@@ -68,16 +68,18 @@ public class Texture<P: PrecisionType>: Tensorial {
} else {
fatalError(" not suuprt ")
}
if MemoryLayout<P>.size == 1 {
tmpTextureDes.pixelFormat = .rgba8Unorm
} else if MemoryLayout<P>.size == 2 {
tmpTextureDes.pixelFormat = .rgba32Float
tmpTextureDes.pixelFormat = .rgba16Float
} else if MemoryLayout<P>.size == 4 {
// tmpTextureDes.pixelFormat = .r32Float
tmpTextureDes.pixelFormat = .rgba32Float
}
// tmpTextureDes.pixelFormat = .rgba16Float
tmpTextureDes.usage = [.shaderRead, .shaderWrite]
tmpTextureDes.storageMode = .shared
textureDesc = tmpTextureDes
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册