Commit cead34f5 authored by Ruilong Liu, committed by GitHub

Merge pull request #609 from codeWorm2015/metal

add conv add implementation
......@@ -55,6 +55,7 @@ public class Executor<P: PrecisionType> {
device = inDevice
queue = inQueue
for block in inProgram.programDesc.blocks {
//block.ops.count
for i in 0..<block.ops.count {
let op = block.ops[i]
do {
......@@ -65,6 +66,7 @@ public class Executor<P: PrecisionType> {
throw error
}
}
// for op in block.ops {
// do {
// let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
......@@ -94,16 +96,14 @@ public class Executor<P: PrecisionType> {
}
buffer.addCompletedHandler { (commandbuffer) in
for op in self.ops {
op.delogOutput()
}
let afterDate = Date.init()
print(" encoder end ! time: \(afterDate.timeIntervalSince(beforeDate))")
}
buffer.commit()
guard let outputVar = program.scope.output() else {
......
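Note on the timing above: addCompletedHandler fires only after the GPU has finished executing every command in the buffer, so the printed interval covers GPU execution, not just CPU-side encoding. A minimal sketch of the pattern (the helper name is illustrative, not part of this commit):

import Foundation
import Metal

// Hypothetical helper for illustration only: commit a command buffer and
// report wall-clock time from commit to GPU completion.
func commitAndTime(_ buffer: MTLCommandBuffer) {
    let beforeDate = Date()
    buffer.addCompletedHandler { _ in
        // Runs once all GPU work in this buffer has completed.
        let afterDate = Date()
        print(" encoder end ! time: \(afterDate.timeIntervalSince(beforeDate))")
    }
    buffer.commit()
}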
......@@ -107,17 +107,16 @@ class ConvAddBatchNormReluOp<P: PrecisionType>: Operator<ConvAddBatchNormReluKer
}
func delogOutput() {
// let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false)
// let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: true)
// para.filter.logDataPointer(header: "filter data pointer: ")
//
// print("filter: \(para.filter)")
// print("biase: \(para.bias)")
// print("padding: \(para.paddings)")
// print("stride: \(para.stride)")
//
// let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false)
// let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false)
// let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: true)
// let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false)
}
}
......@@ -22,7 +22,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable {
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
} else if param.filter.channel == 1 {
- super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_1x1")
+ super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
} else {
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
}
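The change above also fixes the depthwise branch: a filter with channel == 1 is a depthwise 3x3 convolution, so it must load depthwise_conv_add_batch_norm_relu_3x3 rather than the misnamed _1x1 variant. The same selection rule, restated as a pure function (a sketch; FilterShape is illustrative, not a type from this repo):

// Map filter shape to the specialized Metal function name, mirroring
// the if/else chain in the initializer above.
struct FilterShape {
    let width: Int
    let height: Int
    let channel: Int
}

func kernelFunctionName(for filter: FilterShape) -> String {
    if filter.width == 1 && filter.height == 1 {
        return "conv_add_batch_norm_relu_1x1"            // pointwise convolution
    } else if filter.channel == 1 {
        return "depthwise_conv_add_batch_norm_relu_3x3"  // depthwise convolution
    } else {
        return "conv_add_batch_norm_relu_3x3"            // general 3x3 convolution
    }
}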
......@@ -75,7 +75,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable {
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
- encoder.setBuffer(param.bias.buffer, offset: 0, index: 2)
+ encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
encoder.setBuffer(param.newScale!, offset: 0, index: 3)
encoder.setBuffer(param.newBiase!, offset: 0, index: 4)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
......
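The setBuffer calls above must line up index-for-index with the fused kernel's signature further down; the one-line change binds the element-wise add input param.y at index 2 instead of param.bias. The layout implied by the calls (the enum is illustrative, not part of the commit):

// Buffer indices shared between the Swift encoder and the MSL kernels.
enum ConvAddBNReluBufferIndex: Int {
    case convParam = 0  // constant MetalConvParam &param  [[buffer(0)]]
    case weights   = 1  // const device float4 *weights    [[buffer(1)]]
    case biase     = 2  // const device float4 *biase      [[buffer(2)]], now fed from param.y
    case newScale  = 3  // const device float4 *new_scale  [[buffer(3)]]
    case newBiase  = 4  // const device float4 *new_biase  [[buffer(4)]]
}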
......@@ -15,11 +15,31 @@
import Foundation
class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvAddParam<P>) {
super.init(device: device, inFunctionName: "conv_add_1x1")
let offsetX = param.filter.width/2 - Int(param.paddings[0])
let offsetY = param.filter.height/2 - Int(param.paddings[1])
print("offset x: \(offsetX)")
print("offset y: \(offsetY)")
let offsetZ = 0.0
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]))
}
func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
print("Conv Add compute")
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
}
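MetalConvParam tells the shader where the filter window starts relative to each output pixel (offsetX/offsetY), and how many unused channel lanes the last texture slice carries (paddedZ), since channels are stored four per slice of a texture2d_array. A worked sketch of the same formulas (function names are illustrative):

// offset = filterSize/2 - padding: 0 for "same" padding, positive when
// the filter window starts inside the input (the output shrinks).
func convOffsets(filterW: Int, filterH: Int, paddings: (Int, Int)) -> (x: Int, y: Int) {
    return (filterW / 2 - paddings.0, filterH / 2 - paddings.1)
}

// Channels are packed into RGBA slices, so the last slice may carry
// up to three padding lanes: arrayLength * 4 - C.
func paddedZ(channels: Int) -> Int {
    let slices = (channels + 3) / 4
    return slices * 4 - channels
}

// conv_add_1x1 with padding 0 samples at the output coordinate itself,
// and a 3-channel input leaves one padded lane in its single slice.
print(convOffsets(filterW: 1, filterH: 1, paddings: (0, 0)))  // (x: 0, y: 0)
print(paddedZ(channels: 3))                                   // 1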
......@@ -66,7 +66,6 @@ struct MetalConvParam {
//
//}
kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
......@@ -120,8 +119,6 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample>
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
......@@ -165,14 +162,11 @@ kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample>
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
- const device float4 *new_scale [[buffer(3)]],
- const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
......@@ -210,10 +204,10 @@ kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[text
}
- kernel void depthwise_conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+ kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
- const device float4 *weights [[buffer(1)]],
+ const device float *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
......@@ -225,11 +219,10 @@ kernel void depthwise_conv_add_batch_norm_relu_1x1(texture2d_array<float, access
return;
}
uint output_slice = gid.z;
short2 posInInput = short2(gid.xy) + short2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
- uint weithTo = gid.z * kernelHXW;
+ uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0);
float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
......@@ -243,13 +236,12 @@ kernel void depthwise_conv_add_batch_norm_relu_1x1(texture2d_array<float, access
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
float4 input = inputs[j];
- float4 weight = weights[weithTo + j];
- output.x += input.x * weight.x;
- output.y += input.y * weight.y;
- output.z += input.z * weight.z;
- output.w += input.w * weight.w;
+ output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+ output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+ output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+ output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
- output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+ output = (output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
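The depthwise changes above re-lay the filter buffer from interleaved float4 (one float4 per tap, nine per output slice) to planar float (four 9-float channel planes per slice); that is why weithTo gains the * 4 and each lane now indexes its own plane. The index math, restated in Swift (a sketch; the function names are illustrative):

let kernelHXW = 9  // taps in a 3x3 filter

// Old layout: weights[z * 9 + j] was a float4 holding all four lanes of tap j.
func interleavedIndex(slice z: Int, tap j: Int, lane c: Int) -> Int {
    return (z * kernelHXW + j) * 4 + c
}

// New layout: one 9-float plane per lane, four planes per slice.
func planarIndex(slice z: Int, tap j: Int, lane c: Int) -> Int {
    return z * kernelHXW * 4 + c * kernelHXW + j
}

print(planarIndex(slice: 1, tap: 2, lane: 3))  // 36 + 27 + 2 = 65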
......@@ -95,12 +95,14 @@ class Tensor<P: PrecisionType>: Tensorial {
let cSlices = (C + 3) / 4
let paddedC = cSlices * 4
let count = paddedC * dim[0] * dim[1] * dim[2]
- buffer = device.makeBuffer(length: count * MemoryLayout<P>.stride)
if C == paddedC {
+ buffer = device.makeBuffer(length: count * MemoryLayout<P>.stride)
buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
} else if C == 1 {
- buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
+ buffer = device.makeBuffer(length: numel() * MemoryLayout<P>.stride)
+ buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
} else {
+ buffer = device.makeBuffer(length: count * MemoryLayout<P>.stride)
var tmpPointer = data.pointer
var dstPtr = buffer?.contents().bindMemory(to: P.self, capacity: count)
for _ in 0..<dim[0] * dim[1] * dim[2] {
......@@ -120,7 +122,8 @@ class Tensor<P: PrecisionType>: Tensorial {
} else {
fatalError(" not support !")
}
- data.release()
+ //TODO: release
+ // data.release()
}
var width: Int {
......
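The Tensor change above sizes the MTLBuffer per case: when C is already a multiple of 4 the data is copied verbatim; when C == 1 (depthwise filters) a tight buffer of numel() elements suffices; otherwise every C-channel pixel is padded out to paddedC lanes to match the RGBA slice layout of texture2d_array. A CPU-side sketch of that padded copy, with plain arrays standing in for the MTLBuffer (illustrative; Float stands in for P):

// Pad each pixel's C channels out to the next multiple of 4,
// zero-filling the extra lanes, as the else branch above does.
func padChannels(_ src: [Float], pixels: Int, c: Int) -> [Float] {
    let paddedC = (c + 3) / 4 * 4
    var dst = [Float](repeating: 0, count: pixels * paddedC)
    for p in 0..<pixels {
        for ch in 0..<c {
            dst[p * paddedC + ch] = src[p * c + ch]
        }
    }
    return dst
}

// Two 3-channel pixels become two 4-lane pixels with a zero pad lane:
print(padChannels([1, 2, 3, 4, 5, 6], pixels: 2, c: 3))
// [1.0, 2.0, 3.0, 0.0, 4.0, 5.0, 6.0, 0.0]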