提交 cead34f5 编写于 作者: R Ruilong Liu 提交者: GitHub

Merge pull request #609 from codeWorm2015/metal

add conv add imp
...@@ -55,6 +55,7 @@ public class Executor<P: PrecisionType> { ...@@ -55,6 +55,7 @@ public class Executor<P: PrecisionType> {
device = inDevice device = inDevice
queue = inQueue queue = inQueue
for block in inProgram.programDesc.blocks { for block in inProgram.programDesc.blocks {
//block.ops.count
for i in 0..<block.ops.count { for i in 0..<block.ops.count {
let op = block.ops[i] let op = block.ops[i]
do { do {
...@@ -65,6 +66,7 @@ public class Executor<P: PrecisionType> { ...@@ -65,6 +66,7 @@ public class Executor<P: PrecisionType> {
throw error throw error
} }
} }
// for op in block.ops { // for op in block.ops {
// do { // do {
// let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope) // let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
...@@ -94,16 +96,14 @@ public class Executor<P: PrecisionType> { ...@@ -94,16 +96,14 @@ public class Executor<P: PrecisionType> {
} }
buffer.addCompletedHandler { (commandbuffer) in buffer.addCompletedHandler { (commandbuffer) in
for op in self.ops { for op in self.ops {
op.delogOutput() op.delogOutput()
} }
let afterDate = Date.init() let afterDate = Date.init()
print(" encoder end ! time: \(afterDate.timeIntervalSince(beforeDate))") print(" encoder end ! time: \(afterDate.timeIntervalSince(beforeDate))")
} }
buffer.commit() buffer.commit()
guard let outputVar = program.scope.output() else { guard let outputVar = program.scope.output() else {
......
...@@ -107,17 +107,16 @@ class ConvAddBatchNormReluOp<P: PrecisionType>: Operator<ConvAddBatchNormReluKer ...@@ -107,17 +107,16 @@ class ConvAddBatchNormReluOp<P: PrecisionType>: Operator<ConvAddBatchNormReluKer
} }
/// Debug hook for dumping this op's input, parameters and output.
///
/// Every logging statement below is currently commented out, so calling
/// this method has no effect.
func delogOutput() {
    // let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: true)
    // para.filter.logDataPointer(header: "filter data pointer: ")
    //
    // print("filter: \(para.filter)")
    // print("biase: \(para.bias)")
    // print("padding: \(para.paddings)")
    // print("stride: \(para.stride)")
    //
    // let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false)
    // let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false)
    // let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false)
}
} }
...@@ -22,7 +22,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable { ...@@ -22,7 +22,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable {
if param.filter.width == 1 && param.filter.height == 1 { if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1") super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
} else if param.filter.channel == 1 { } else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_1x1") super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
} else { } else {
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3") super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
} }
...@@ -75,7 +75,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable { ...@@ -75,7 +75,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable {
encoder.setTexture(param.output.metalTexture, index: 1) encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0) encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
encoder.setBuffer(param.bias.buffer, offset: 0, index: 2) encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
encoder.setBuffer(param.newScale!, offset: 0, index: 3) encoder.setBuffer(param.newScale!, offset: 0, index: 3)
encoder.setBuffer(param.newBiase!, offset: 0, index: 4) encoder.setBuffer(param.newBiase!, offset: 0, index: 4)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
......
...@@ -15,11 +15,31 @@ ...@@ -15,11 +15,31 @@
import Foundation import Foundation
/// Kernel wrapper that encodes a convolution followed by a bias add.
///
/// The convolution geometry (sampling offsets, strides and channel padding)
/// is derived from the op parameters once, at construction time, and handed
/// to the shader as a constant block on every `compute` call.
class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
    /// Constant block bound at buffer index 0 of the shader.
    /// Stays `var` because `setBytes` receives it through an in-out pointer.
    var metalParam: MetalConvParam

    /// Builds the shader constants from `param` and binds the `conv_add_1x1` function.
    ///
    /// - Parameters:
    ///   - device: Metal device forwarded to the `Kernel` initializer.
    ///   - param: Op parameters; filter size, paddings, stride and the input
    ///     texture/dimensions are read here.
    required init(device: MTLDevice, param: ConvAddParam<P>) {
        let offsetX = param.filter.width / 2 - Int(param.paddings[0])
        let offsetY = param.filter.height / 2 - Int(param.paddings[1])
        let offsetZ = 0
        // Channels that exist in the texture (4 per array slice) but not in
        // the logical input, i.e. arrayLength * 4 - dim[3].
        let paddedZ = param.input.metalTexture.arrayLength * 4 - param.input.dim[3]

        // Stored properties must be set before delegating to the superclass,
        // which lets `metalParam` be non-optional.
        metalParam = MetalConvParam(offsetX: Int16(offsetX),
                                    offsetY: Int16(offsetY),
                                    offsetZ: Int16(offsetZ),
                                    strideX: UInt16(param.stride[0]),
                                    strideY: UInt16(param.stride[1]),
                                    paddedZ: UInt16(paddedZ))

        // NOTE(review): the function name is fixed to the 1x1 variant while the
        // offsets above depend on the filter size - confirm that callers only
        // create this kernel for 1x1 filters.
        super.init(device: device, inFunctionName: "conv_add_1x1")

        print("offset x: \(offsetX)")
        print("offset y: \(offsetY)")
    }

    /// Encodes one dispatch of the kernel into `commandBuffer`.
    ///
    /// Binding layout: textures 0/1 are input/output; buffers 0/1/2 are the
    /// constant block, the filter weights and `param.y`.
    ///
    /// - Throws: `PaddleMobileError.predictError` when no compute encoder can
    ///   be created from the command buffer.
    func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam<P>) throws {
        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
            throw PaddleMobileError.predictError(message: " encode is nil")
        }

        print("Conv Add compute")

        encoder.setTexture(param.input.metalTexture, index: 0)
        encoder.setTexture(param.output.metalTexture, index: 1)

        encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
        encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
        encoder.setBuffer(param.y.buffer, offset: 0, index: 2)

        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
        encoder.endEncoding()
    }
}
...@@ -66,7 +66,6 @@ struct MetalConvParam { ...@@ -66,7 +66,6 @@ struct MetalConvParam {
// //
//} //}
kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]], texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
...@@ -120,8 +119,6 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> ...@@ -120,8 +119,6 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample>
outTexture.write(output, gid.xy, gid.z); outTexture.write(output, gid.xy, gid.z);
} }
kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]], texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
...@@ -165,14 +162,11 @@ kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> ...@@ -165,14 +162,11 @@ kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample>
outTexture.write(output, gid.xy, gid.z); outTexture.write(output, gid.xy, gid.z);
} }
kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]], texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]], const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]], const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
...@@ -210,10 +204,10 @@ kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[text ...@@ -210,10 +204,10 @@ kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[text
} }
kernel void depthwise_conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]], texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]], constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]], const device float *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]], const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]], const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]], const device float4 *new_biase [[buffer(4)]],
...@@ -225,11 +219,10 @@ kernel void depthwise_conv_add_batch_norm_relu_1x1(texture2d_array<float, access ...@@ -225,11 +219,10 @@ kernel void depthwise_conv_add_batch_norm_relu_1x1(texture2d_array<float, access
return; return;
} }
uint output_slice = gid.z; uint output_slice = gid.z;
short2 posInInput = short2(gid.xy) + short2(param.offsetX, param.offsetY); short2 posInInput = short2(gid.xy) + short2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9; const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW; uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0); float4 output = float4(0.0);
float4 inputs[9]; float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
...@@ -243,13 +236,12 @@ kernel void depthwise_conv_add_batch_norm_relu_1x1(texture2d_array<float, access ...@@ -243,13 +236,12 @@ kernel void depthwise_conv_add_batch_norm_relu_1x1(texture2d_array<float, access
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) { for (int j = 0; j < 9; ++j) {
float4 input = inputs[j]; float4 input = inputs[j];
float4 weight = weights[weithTo + j]; output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.x += input.x * weight.x; output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.y += input.y * weight.y; output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.z += input.z * weight.z; output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
output.w += input.w * weight.w;
} }
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); output = (output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z];
outTexture.write(output, gid.xy, gid.z); outTexture.write(output, gid.xy, gid.z);
} }
...@@ -95,12 +95,14 @@ class Tensor<P: PrecisionType>: Tensorial { ...@@ -95,12 +95,14 @@ class Tensor<P: PrecisionType>: Tensorial {
let cSlices = (C + 3) / 4 let cSlices = (C + 3) / 4
let paddedC = cSlices * 4 let paddedC = cSlices * 4
let count = paddedC * dim[0] * dim[1] * dim[2] let count = paddedC * dim[0] * dim[1] * dim[2]
buffer = device.makeBuffer(length: count * MemoryLayout<P>.stride)
if C == paddedC { if C == paddedC {
buffer = device.makeBuffer(length: count * MemoryLayout<P>.stride)
buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride) buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
} else if C == 1 { } else if C == 1 {
buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride) buffer = device.makeBuffer(length: numel() * MemoryLayout<P>.stride)
buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
} else { } else {
buffer = device.makeBuffer(length: count * MemoryLayout<P>.stride)
var tmpPointer = data.pointer var tmpPointer = data.pointer
var dstPtr = buffer?.contents().bindMemory(to: P.self, capacity: count) var dstPtr = buffer?.contents().bindMemory(to: P.self, capacity: count)
for _ in 0..<dim[0] * dim[1] * dim[2] { for _ in 0..<dim[0] * dim[1] * dim[2] {
...@@ -120,7 +122,8 @@ class Tensor<P: PrecisionType>: Tensorial { ...@@ -120,7 +122,8 @@ class Tensor<P: PrecisionType>: Tensorial {
} else { } else {
fatalError(" not support !") fatalError(" not support !")
} }
data.release() //TODO: release
// data.release()
} }
var width: Int { var width: Int {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册