Commit cead34f5 authored by Ruilong Liu, committed by GitHub

Merge pull request #609 from codeWorm2015/metal

add conv add implementation
......@@ -55,6 +55,7 @@ public class Executor<P: PrecisionType> {
device = inDevice
queue = inQueue
for block in inProgram.programDesc.blocks {
//block.ops.count
for i in 0..<block.ops.count {
let op = block.ops[i]
do {
......@@ -65,6 +66,7 @@ public class Executor<P: PrecisionType> {
throw error
}
}
// for op in block.ops {
// do {
// let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
......@@ -94,16 +96,14 @@ public class Executor<P: PrecisionType> {
}
buffer.addCompletedHandler { (commandbuffer) in
for op in self.ops {
op.delogOutput()
}
let afterDate = Date.init()
print(" encoder end ! time: \(afterDate.timeIntervalSince(beforeDate))")
}
buffer.commit()
guard let outputVar = program.scope.output() else {
......
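Note on the timing above: addCompletedHandler fires only after the GPU has finished executing every command in the buffer, so the printed interval covers GPU execution, not just CPU-side encoding. A minimal sketch of the pattern (the helper name is illustrative, not part of this commit):

import Foundation
import Metal

// Hypothetical helper for illustration only: commit a command buffer and
// report wall-clock time from commit to GPU completion.
func commitAndTime(_ buffer: MTLCommandBuffer) {
    let beforeDate = Date()
    buffer.addCompletedHandler { _ in
        // Runs once all GPU work in this buffer has completed.
        let afterDate = Date()
        print(" encoder end ! time: \(afterDate.timeIntervalSince(beforeDate))")
    }
    buffer.commit()
}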
......@@ -107,17 +107,16 @@ class ConvAddBatchNormReluOp<P: PrecisionType>: Operator<ConvAddBatchNormReluKer
}
func delogOutput() {
// let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false)
// let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: true)
// para.filter.logDataPointer(header: "filter data pointer: ")
//
// print("filter: \(para.filter)")
// print("biase: \(para.bias)")
// print("padding: \(para.paddings)")
// print("stride: \(para.stride)")
//
// let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false)
// let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false)
// let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: true)
// let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false)
}
}
......@@ -22,7 +22,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable {
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
} else if param.filter.channel == 1 {
- super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_1x1")
+ super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
} else {
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
}
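The change above also fixes the depthwise branch: a filter with channel == 1 is a depthwise 3x3 convolution, so it must load depthwise_conv_add_batch_norm_relu_3x3 rather than the misnamed _1x1 variant. The same selection rule, restated as a pure function (a sketch; FilterShape is illustrative, not a type from this repo):

// Map filter shape to the specialized Metal function name, mirroring
// the if/else chain in the initializer above.
struct FilterShape {
    let width: Int
    let height: Int
    let channel: Int
}

func kernelFunctionName(for filter: FilterShape) -> String {
    if filter.width == 1 && filter.height == 1 {
        return "conv_add_batch_norm_relu_1x1"            // pointwise convolution
    } else if filter.channel == 1 {
        return "depthwise_conv_add_batch_norm_relu_3x3"  // depthwise convolution
    } else {
        return "conv_add_batch_norm_relu_3x3"            // general 3x3 convolution
    }
}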
......@@ -75,7 +75,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable {
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
- encoder.setBuffer(param.bias.buffer, offset: 0, index: 2)
+ encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
encoder.setBuffer(param.newScale!, offset: 0, index: 3)
encoder.setBuffer(param.newBiase!, offset: 0, index: 4)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
......
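The setBuffer calls above must line up index-for-index with the fused kernel's signature further down; the one-line change binds the element-wise add input param.y at index 2 instead of param.bias. The layout implied by the calls (the enum is illustrative, not part of the commit):

// Buffer indices shared between the Swift encoder and the MSL kernels.
enum ConvAddBNReluBufferIndex: Int {
    case convParam = 0  // constant MetalConvParam &param  [[buffer(0)]]
    case weights   = 1  // const device float4 *weights    [[buffer(1)]]
    case biase     = 2  // const device float4 *biase      [[buffer(2)]], now fed from param.y
    case newScale  = 3  // const device float4 *new_scale  [[buffer(3)]]
    case newBiase  = 4  // const device float4 *new_biase  [[buffer(4)]]
}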
......@@ -15,11 +15,31 @@
import Foundation
class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvAddParam<P>) {
super.init(device: device, inFunctionName: "conv_add_1x1")
let offsetX = param.filter.width/2 - Int(param.paddings[0])
let offsetY = param.filter.height/2 - Int(param.paddings[1])
print("offset x: \(offsetX)")
print("offset y: \(offsetY)")
let offsetZ = 0.0
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]))
}
func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
print("Conv Add compute")
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
}
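MetalConvParam tells the shader where the filter window starts relative to each output pixel (offsetX/offsetY), and how many unused channel lanes the last texture slice carries (paddedZ), since channels are stored four per slice of a texture2d_array. A worked sketch of the same formulas (function names are illustrative):

// offset = filterSize/2 - padding: 0 for "same" padding, positive when
// the filter window starts inside the input (the output shrinks).
func convOffsets(filterW: Int, filterH: Int, paddings: (Int, Int)) -> (x: Int, y: Int) {
    return (filterW / 2 - paddings.0, filterH / 2 - paddings.1)
}

// Channels are packed into RGBA slices, so the last slice may carry
// up to three padding lanes: arrayLength * 4 - C.
func paddedZ(channels: Int) -> Int {
    let slices = (channels + 3) / 4
    return slices * 4 - channels
}

// conv_add_1x1 with padding 0 samples at the output coordinate itself,
// and a 3-channel input leaves one padded lane in its single slice.
print(convOffsets(filterW: 1, filterH: 1, paddings: (0, 0)))  // (x: 0, y: 0)
print(paddedZ(channels: 3))                                   // 1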
......@@ -66,7 +66,6 @@ struct MetalConvParam {
//
//}
kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
......@@ -120,8 +119,6 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample>
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
......@@ -165,14 +162,11 @@ kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample>
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
- const device float4 *new_scale [[buffer(3)]],
- const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
......@@ -210,10 +204,10 @@ kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[text
}
- kernel void depthwise_conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+ kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
- const device float4 *weights [[buffer(1)]],
+ const device float *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
......@@ -225,11 +219,10 @@ kernel void depthwise_conv_add_batch_norm_relu_1x1(texture2d_array<float, access
return;
}
uint output_slice = gid.z;
short2 posInInput = short2(gid.xy) + short2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
- uint weithTo = gid.z * kernelHXW;
+ uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0);
float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
......@@ -243,13 +236,12 @@ kernel void depthwise_conv_add_batch_norm_relu_1x1(texture2d_array<float, access
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
float4 input = inputs[j];
- float4 weight = weights[weithTo + j];
- output.x += input.x * weight.x;
- output.y += input.y * weight.y;
- output.z += input.z * weight.z;
- output.w += input.w * weight.w;
+ output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+ output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+ output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+ output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
- output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+ output = (output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
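The depthwise changes above re-lay the filter buffer from interleaved float4 (one float4 per tap, nine per output slice) to planar float (four 9-float channel planes per slice); that is why weithTo gains the * 4 and each lane now indexes its own plane. The index math, restated in Swift (a sketch; the function names are illustrative):

let kernelHXW = 9  // taps in a 3x3 filter

// Old layout: weights[z * 9 + j] was a float4 holding all four lanes of tap j.
func interleavedIndex(slice z: Int, tap j: Int, lane c: Int) -> Int {
    return (z * kernelHXW + j) * 4 + c
}

// New layout: one 9-float plane per lane, four planes per slice.
func planarIndex(slice z: Int, tap j: Int, lane c: Int) -> Int {
    return z * kernelHXW * 4 + c * kernelHXW + j
}

print(planarIndex(slice: 1, tap: 2, lane: 3))  // 36 + 27 + 2 = 65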
......@@ -95,12 +95,14 @@ class Tensor<P: PrecisionType>: Tensorial {
let cSlices = (C + 3) / 4
let paddedC = cSlices * 4
let count = paddedC * dim[0] * dim[1] * dim[2]
- buffer = device.makeBuffer(length: count * MemoryLayout<P>.stride)
if C == paddedC {
+ buffer = device.makeBuffer(length: count * MemoryLayout<P>.stride)
buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
} else if C == 1 {
- buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
+ buffer = device.makeBuffer(length: numel() * MemoryLayout<P>.stride)
+ buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
} else {
+ buffer = device.makeBuffer(length: count * MemoryLayout<P>.stride)
var tmpPointer = data.pointer
var dstPtr = buffer?.contents().bindMemory(to: P.self, capacity: count)
for _ in 0..<dim[0] * dim[1] * dim[2] {
......@@ -120,7 +122,8 @@ class Tensor<P: PrecisionType>: Tensorial {
} else {
fatalError(" not support !")
}
- data.release()
+ //TODO: release
+ // data.release()
}
var width: Int {
......
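The Tensor change above sizes the MTLBuffer per case: when C is already a multiple of 4 the data is copied verbatim; when C == 1 (depthwise filters) a tight buffer of numel() elements suffices; otherwise every C-channel pixel is padded out to paddedC lanes to match the RGBA slice layout of texture2d_array. A CPU-side sketch of that padded copy, with plain arrays standing in for the MTLBuffer (illustrative; Float stands in for P):

// Pad each pixel's C channels out to the next multiple of 4,
// zero-filling the extra lanes, as the else branch above does.
func padChannels(_ src: [Float], pixels: Int, c: Int) -> [Float] {
    let paddedC = (c + 3) / 4 * 4
    var dst = [Float](repeating: 0, count: pixels * paddedC)
    for p in 0..<pixels {
        for ch in 0..<c {
            dst[p * paddedC + ch] = src[p * c + ch]
        }
    }
    return dst
}

// Two 3-channel pixels become two 4-lane pixels with a zero pad lane:
print(padChannels([1, 2, 3, 4, 5, 6], pixels: 2, c: 3))
// [1.0, 2.0, 3.0, 0.0, 4.0, 5.0, 6.0, 0.0]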