Commit 63e8bb92 (PaddlePaddle / Paddle-Lite), unverified.
Authored by Ruilong Liu on Jul 12, 2018; committed via GitHub on Jul 12, 2018.

Merge pull request #556 from codeWorm2015/metal

correct buffer

Parents: a165eaba, 60f66b15
Showing 13 changed files with 163 additions and 86 deletions (+163 / -86):
metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift   +6   -11
metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift   +19   -2
metal/paddle-mobile/paddle-mobile/Executor.swift   +14   -15
metal/paddle-mobile/paddle-mobile/Loader.swift   +14   -13
metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift   +9   -0
metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift   +2   -2
metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift   +1   -2
metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal   +53   -10
metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernels.metal   +15   -5
metal/paddle-mobile/paddle-mobile/Program/Attribute.swift   +4   -1
metal/paddle-mobile/paddle-mobile/framework/Tensor.swift   +20   -22
metal/paddle-mobile/paddle-mobile/framework/Texture.swift   +2   -2
test/net/test_mobilenet.cpp   +4   -1
metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift

@@ -29,11 +29,11 @@ class ViewController: UIViewController {
   // let queue: MTLCommandQueue
   func scaleTexture(queue: MTLCommandQueue, input: MTLTexture, complete: @escaping (MTLTexture) -> Void) {
     let tmpTextureDes = MTLTextureDescriptor.init()
-    tmpTextureDes.width = 227
-    tmpTextureDes.height = 227
+    tmpTextureDes.width = 224
+    tmpTextureDes.height = 224
     tmpTextureDes.depth = 1
     tmpTextureDes.usage = [.shaderRead, .shaderWrite]
-    tmpTextureDes.pixelFormat = .rgba16Float
+    tmpTextureDes.pixelFormat = .rgba32Float
     tmpTextureDes.textureType = .type2D
     tmpTextureDes.storageMode = .shared
     tmpTextureDes.cpuCacheMode = .defaultCache
@@ -64,23 +64,18 @@ class ViewController: UIViewController {
     }
     scaleTexture(queue: queue!, input: inTexture) { (inputTexture) in
-      let loader = Loader<Float16>.init()
+      let loader = Loader<Float32>.init()
       do {
         let modelPath = Bundle.main.path(forResource: "model", ofType: nil) ?! "model null"
         let paraPath = Bundle.main.path(forResource: "params", ofType: nil) ?! "para null"
         let program = try loader.load(device: self.device, modelPath: modelPath, paraPath: paraPath)
-        let executor = try Executor<Float16>.init(inDevice: self.device, inQueue: queue!, inProgram: program)
-        let output = try executor.predict(input: inputTexture, expect: [1, 227, 227, 3])
+        let executor = try Executor<Float32>.init(inDevice: self.device, inQueue: queue!, inProgram: program)
+        let output = try executor.predict(input: inputTexture, expect: [1, 224, 224, 3])
         // print(output)
       } catch let error {
         print(error)
       }
     }
   }
 }
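Note on the `?!` in the bundle-path lines above: it is a project-defined "unwrap or fail with a message" operator, not standard Swift, and its definition is not part of this diff. A minimal sketch of what such an operator usually looks like (names and precedence assumed here, purely illustrative):

  // Hypothetical sketch of an unwrap-or-die operator like the `?!` used above.
  // The real definition lives elsewhere in paddle-mobile; this is not from the commit.
  infix operator ?!: NilCoalescingPrecedence

  func ?! <T>(value: T?, message: String) -> T {
      guard let value = value else {
          fatalError(message)   // e.g. "model null" when the bundle path is missing
      }
      return value
  }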
metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift

@@ -93,7 +93,7 @@ public extension MTLTexture {
     print("texture: \(self)")
     if textureType == .type2DArray {
       for i in 0..<arrayLength {
-        var str: String = "slice: \(i): "
+        var str: String = "slice: \(i): \n"
         let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<T>.size, alignment: MemoryLayout<T>.alignment)
         let bytesPerRow = width * depth * 4 * MemoryLayout<T>.size
         let bytesPerImage = width * height * depth * 4 * MemoryLayout<T>.size
@@ -142,8 +142,25 @@ public extension MTLTexture {
 }

+public extension MTLBuffer {
+  func logDesc<T>(header: String = "", stridable: Bool = true) -> T? {
+    print(header)
+    print("MTLBuffer: \(self)")
+    var str = ""
+    if stridable && length / MemoryLayout<T>.stride > 1000 {
+      for j in stride(from: 0, to: length, by: length / MemoryLayout<T>.stride / 100) {
+        str += " \(contents().assumingMemoryBound(to: T.self)[j])"
+      }
+    } else {
+      for i in 0..<length / MemoryLayout<T>.size {
+        str += " \(contents().assumingMemoryBound(to: T.self)[i])"
+      }
+    }
+    print(str)
+    return nil
+  }
+}
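The new MTLBuffer.logDesc mirrors the MTLTexture debugging helper above: it reinterprets the buffer's raw contents as a chosen element type and prints a sample of the values. Call sites elsewhere in this commit pick the type by annotating an otherwise discarded result, for example `let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false)`. A minimal self-contained sketch of the same pattern, assuming Float32 contents (illustrative only, not the project's API):

  import Metal

  // Dump an MTLBuffer for debugging by binding its raw bytes to Float32,
  // the same contents().assumingMemoryBound(to:) pattern logDesc<T> relies on.
  func dumpBuffer(_ buffer: MTLBuffer, header: String) {
      print(header)
      let count = buffer.length / MemoryLayout<Float32>.stride
      let values = buffer.contents().assumingMemoryBound(to: Float32.self)
      var line = ""
      for i in 0..<count {
          line += " \(values[i])"
      }
      print(line)
  }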
metal/paddle-mobile/paddle-mobile/Executor.swift

@@ -55,17 +55,8 @@ public class Executor<P: PrecisionType> {
     device = inDevice
     queue = inQueue
     for block in inProgram.programDesc.blocks {
-      // for i in 0..<2 {
-      //   let op = block.ops[i]
-      //   do {
-      //     let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
-      //     op.inferShape()
-      //     ops.append(op)
-      //   } catch let error {
-      //     throw error
-      //   }
-      // }
-      for op in block.ops {
+      for i in 0..<2 {
+        let op = block.ops[i]
         do {
           let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
           op.inferShape()
@@ -74,6 +65,15 @@ public class Executor<P: PrecisionType> {
           throw error
         }
       }
+      // for op in block.ops {
+      //   do {
+      //     let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
+      //     op.inferShape()
+      //     ops.append(op)
+      //   } catch let error {
+      //     throw error
+      //   }
+      // }
     }
   }
@@ -95,9 +95,9 @@ public class Executor<P: PrecisionType> {
     buffer.addCompletedHandler { (commandbuffer) in
-      // for op in self.ops {
-      //   op.delogOutput()
-      // }
+      for op in self.ops {
+        op.delogOutput()
+      }
       let afterDate = Date.init()
       print(" encoder end ! time: \(afterDate.timeIntervalSince(beforeDate))")
@@ -114,7 +114,6 @@ public class Executor<P: PrecisionType> {
       throw PaddleMobileError.netError(message: "output var type error")
     }
     return output
   }
metal/paddle-mobile/paddle-mobile/Loader.swift

@@ -50,7 +50,7 @@ public class Loader<P: PrecisionType> {
       return pointee
     }
-    _ = pointerReader(type: UInt32.self)
+    let _ = pointerReader(type: UInt32.self)
     let lodLevel = pointerReader(type: UInt64.self)
     for _ in 0..<lodLevel {
       let size = pointerReader(type: UInt64.self)
@@ -62,6 +62,7 @@ public class Loader<P: PrecisionType> {
     let _ = pointerReader(type: UInt32.self)
     let tensorDescSize = pointerReader(type: Int32.self)
     fseek(file, Int(tensorDescSize), SEEK_CUR)
     nowIndex += Int(tensorDescSize)
@@ -70,21 +71,21 @@ public class Loader<P: PrecisionType> {
     */
     // The params are currently stored as Float; this should really follow the model's data type.
-    let tmpCapacity = MemoryLayout<Float>.size * tensor.numel()
-    let tmpPointer = UnsafeMutablePointer<Float>.allocate(capacity: tmpCapacity);
-    // let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file)
-    // guard bytesRead == tensor.data.size else {
-    //   throw PaddleMobileError.loaderError(message: "param read size error")
-    // }
-    let bytesRead = fread(tmpPointer, 1, tmpCapacity, file)
-    for i in 0..<tensor.numel() {
-      tensor.data[i] = P.init(inFloat: tmpPointer[i])
-    }
-    tmpPointer.deinitialize(count: tmpCapacity)
-    tmpPointer.deallocate()
+    // let tmpCapacity = MemoryLayout<Float>.size * tensor.numel()
+    // let tmpPointer = UnsafeMutablePointer<Float>.allocate(capacity: tmpCapacity);
+    let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file)
+    guard bytesRead == tensor.data.size else {
+      throw PaddleMobileError.loaderError(message: "param read size error")
+    }
+    // TODO: use script to convert
+    // let bytesRead = fread(tmpPointer, 1, tmpCapacity, file)
+    // for i in 0..<tensor.numel() {
+    //   tensor.data[i] = P.init(inFloat: tmpPointer[i])
+    // }
+    // tmpPointer.deinitialize(count: tmpCapacity)
+    // tmpPointer.deallocate()
     nowIndex += bytesRead
   }
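The path that is commented out above is the precision-conversion route: the params file stores Float32 values, so when the runtime precision P is narrower (for example Float16), values are staged through a temporary Float buffer and converted one element at a time via `P.init(inFloat:)`. A minimal self-contained sketch of that conversion step (the helper itself is illustrative and not the project's API; only the loop body mirrors the diff):

  // Convert a staging buffer of Float32 params into the runtime precision type,
  // mirroring `tensor.data[i] = P.init(inFloat: tmpPointer[i])` from the diff.
  func convertParams<P: BinaryFloatingPoint>(_ staging: UnsafePointer<Float32>,
                                             count: Int,
                                             into destination: UnsafeMutablePointer<P>) {
      for i in 0..<count {
          destination[i] = P(staging[i])   // e.g. Float32 -> Float16 narrowing
      }
  }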
metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift

@@ -107,7 +107,16 @@ class ConvAddBatchNormReluOp<P: PrecisionType>: Operator<ConvAddBatchNormReluKer
   }
   func delogOutput() {
+    let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false)
+    para.filter.logDataPointer(header: "filter data pointer: ")
+    print("filter: \(para.filter)")
+    print("biase: \(para.bias)")
+    let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false)
+    let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false)
+    let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: true)
   }
 }
metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift

@@ -61,8 +61,8 @@ class FeedOp<P: PrecisionType>: Operator<Texture2DTo2DArrayKernel<P>, FeedParam<
   func delogOutput() {
     // para.input.mtlTexture.logDesc()
-    let _: Float16? = para.input.mtlTexture.logDesc(header: "feed input: ")
-    let _: Float16? = para.output.metalTexture.logDesc(header: "feed output: ")
+    // let _: P? = para.input.mtlTexture.logDesc(header: "feed input: ", stridable: true)
+    // let _: P? = para.output.metalTexture.logDesc(header: "feed output: ", stridable: true)
   }
 }
metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift

@@ -29,7 +29,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable {
     let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
     for i in 0..<param.variance.buffer.length / MemoryLayout<P>.stride {
-      let inv = pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
+      let inv = 1.0 / pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
       invs.append(P(inv))
     }
@@ -59,7 +59,6 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable {
     }
-    print("ConvAddBatchNormReluKernel compute")
     encoder.setTexture(param.input.metalTexture, index: 0)
     encoder.setTexture(param.output.metalTexture, index: 1)
     encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
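The change to `inv` matters because the kernel folds batch normalization into a per-channel multiply-add applied right after the convolution: with inv = 1 / sqrt(variance + epsilon), the fused parameters expected by the Metal kernel's `(output + biase) * new_scale + new_biase` are new_scale = scale * inv and new_biase = bias - mean * scale * inv. A small sketch of that folding under those assumptions (the newScale/newBiase computation is not fully shown in this diff; only the variable names follow the kernel):

  // Fold batch-norm statistics into a per-channel scale and bias so the GPU
  // kernel only has to compute x * newScale + newBias after the convolution.
  func foldBatchNorm(mean: [Float32], variance: [Float32],
                     scale: [Float32], bias: [Float32],
                     epsilon: Float32) -> (newScale: [Float32], newBias: [Float32]) {
      var newScale = [Float32](repeating: 0, count: mean.count)
      var newBias = [Float32](repeating: 0, count: mean.count)
      for i in 0..<mean.count {
          let inv = 1.0 / (variance[i] + epsilon).squareRoot()   // the corrected inv
          newScale[i] = scale[i] * inv
          newBias[i] = bias[i] - mean[i] * scale[i] * inv
      }
      return (newScale, newBias)
  }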
metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal

@@ -59,13 +59,56 @@ kernel void conv3x3(texture2d_array<half, access::sample> inTexture [[texture(0)
   outTexture.write(output, gid.xy, gid.z);
 }
-kernel void conv_add_batch_norm_relu_3x3(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                         texture2d_array<half, access::write> outTexture [[texture(1)]],
+//kernel void conv_add_batch_norm_relu_3x3(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+//                                         texture2d_array<half, access::write> outTexture [[texture(1)]],
+//                                         constant MetalConvParam &param [[buffer(0)]],
+//                                         const device half4 *weights [[buffer(1)]],
+//                                         const device half4 *biase [[buffer(2)]],
+//                                         const device half4 *new_scale [[buffer(3)]],
+//                                         const device half4 *new_biase [[buffer(4)]],
+//                                         uint3 gid [[thread_position_in_grid]]) {
+//
+//  if (gid.x >= outTexture.get_width() ||
+//      gid.y >= outTexture.get_height() ||
+//      gid.z >= outTexture.get_array_size()) {
+//    return;
+//  }
+//
+//  short2 posInInput = short2(gid.xy) + short2(param.offsetX, param.offsetY);
+//  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+//  const uint wightSliceCount = 36;
+//  uint weithTo = gid.z * wightSliceCount * inTexture.get_array_size();
+//  half4 output = 0.0;
+//  for (uint i = 0; i < inTexture.get_array_size(); ++i) {
+//    half4 input[9];
+//    input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
+//    input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
+//    input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
+//    input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
+//    input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+//    input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
+//    input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
+//    input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
+//    input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
+//    for (int j = 0; j < 9; ++j) {
+//      half4 weight = weights[weithTo + wightSliceCount * i + j * 4];
+//      output += dot(input[j], weight);
+//    }
+//  }
+//
+//  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0h);
+//  outTexture.write(output, gid.xy, gid.z);
+//
+//}
+kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
                                          constant MetalConvParam &param [[buffer(0)]],
-                                         const device half4 *weights [[buffer(1)]],
-                                         const device half4 *biase [[buffer(2)]],
-                                         const device half4 *new_scale [[buffer(3)]],
-                                         const device half4 *new_biase [[buffer(4)]],
+                                         const device float4 *weights [[buffer(1)]],
+                                         const device float4 *biase [[buffer(2)]],
+                                         const device float4 *new_scale [[buffer(3)]],
+                                         const device float4 *new_biase [[buffer(4)]],
                                          uint3 gid [[thread_position_in_grid]]) {
   if (gid.x >= outTexture.get_width() ||
@@ -78,9 +121,9 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<half, access::sample> inTexture
   constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
   const uint wightSliceCount = 36;
   uint weithTo = gid.z * wightSliceCount * inTexture.get_array_size();
-  half4 output = 0.0;
+  float4 output = 0.0;
   for (uint i = 0; i < inTexture.get_array_size(); ++i) {
-    half4 input[9];
+    float4 input[9];
     input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
     input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
     input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
@@ -91,12 +134,12 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<half, access::sample> inTexture
     input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
     input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
     for (int j = 0; j < 9; ++j) {
-      half4 weight = weights[weithTo + wightSliceCount * i + j * 4];
+      float4 weight = weights[weithTo + wightSliceCount * i + j * 4];
       output += dot(input[j], weight);
     }
   }
-  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0h);
+  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
   outTexture.write(output, gid.xy, gid.z);
 }
metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernels.metal

@@ -73,15 +73,25 @@ kernel void batchnorm(texture2d_array<half, access::read> inTexture [[texture(0)
   outTexture.write(input, gid.xy, gid.z);
 }
-kernel void texture2d_to_2d_array(texture2d<half, access::read> inTexture [[texture(0)]],
-                                  texture2d_array<half, access::write> outTexture [[texture(1)]],
+//kernel void texture2d_to_2d_array(texture2d<half, access::read> inTexture [[texture(0)]],
+//                                  texture2d_array<half, access::write> outTexture [[texture(1)]],
+//                                  uint3 gid [[thread_position_in_grid]]) {
+//  if (gid.x >= inTexture.get_width() ||
+//      gid.y >= inTexture.get_height()){
+//    return;
+//  }
+//  const half4 input = inTexture.read(gid.xy);
+//  outTexture.write(input, gid.xy, 0);
+//}
+kernel void texture2d_to_2d_array(texture2d<float, access::read> inTexture [[texture(0)]],
+                                  texture2d_array<float, access::write> outTexture [[texture(1)]],
                                   uint3 gid [[thread_position_in_grid]]) {
   if (gid.x >= inTexture.get_width() ||
       gid.y >= inTexture.get_height()){
     return;
   }
-  const half4 input = inTexture.read(gid.xy);
+  const float4 input = inTexture.read(gid.xy);
   outTexture.write(input, gid.xy, 0);
 }
metal/paddle-mobile/paddle-mobile/Program/Attribute.swift

@@ -32,6 +32,9 @@ extension Int64: Attr {
 extension Array: Attr {
 }
+extension String: Attr {
+}
 func attrWithProtoDesc(attrDesc: PaddleMobile_Framework_Proto_OpDesc.Attr) -> Attr {
   switch attrDesc.type {
   case .boolean:
@@ -39,7 +42,7 @@ func attrWithProtoDesc(attrDesc: PaddleMobile_Framework_Proto_OpDesc.Attr) -> Attr {
   case .int:
     return Int(attrDesc.i)
   case .string:
-    return attrDesc.strings
+    return attrDesc.s
   case .long:
     return attrDesc.l
   case .float:
metal/paddle-mobile/paddle-mobile/framework/Tensor.swift

@@ -38,7 +38,7 @@ class Tensor<P: PrecisionType>: Tensorial {
     pointer = inPointer
   }
   let size: Int
-  fileprivate var pointer: UnsafeMutablePointer<P>
+  var pointer: UnsafeMutablePointer<P>
   subscript(index: Int) -> P {
     get {
       return pointer[index]
@@ -104,7 +104,7 @@ class Tensor<P: PrecisionType>: Tensorial {
     for _ in 0..<dim[0] * dim[1] * dim[2] {
       for j in 0..<paddedC {
         if j < C {
-          dstPtr?[j] = data.pointer[j]
+          dstPtr?[j] = tmpPointer[j]
         }
       }
       tmpPointer += C
@@ -134,7 +134,7 @@ class Tensor<P: PrecisionType>: Tensorial {
     for h in 0..<H {
       for w in 0..<W {
         for c in 0..<C {
-          newPtr[index] = data.pointer[n * CXHXW + c * HXW + h * w + w]
+          newPtr[index] = data.pointer[n * CXHXW + c * HXW + h * W + w]
           index += 1
         }
       }
@@ -146,27 +146,25 @@
 extension Tensor {
-  var debugDescription: String {
-    var str = ""
-    // for i in 0..<buffer.length/MemoryLayout<P>.stride {
-    //   str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])"
-    // }
+  var debugDescription: String {
+    var str = "dim: \(dim) \n"
+    str += "MTLBuffer: \(self.buffer) \n"
+    for i in 0..<buffer.length / MemoryLayout<P>.size {
+      str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])"
+    }
     return str
   }
-  // var str = ""
-  // str += "Dim: \(dim) \n value:[ "
-  // if data.size < 20 {
-  //   for d in 0..<data.size {
-  //     str += " \(data[d]) "
-  //   }
-  // } else {
-  //   for d in stride(from: 0, to: data.size, by: data.size/20) {
-  //     str += " \(data[d]) "
-  //   }
-  // }
-  // str += " ]"
-  // return str
+  func logDataPointer(header: String = "") {
+    print(header)
+    var str = ""
+    str += "data size: \(data.size) \n"
+    str += "dim: \(dim) \n"
+    for i in 0..<numel() {
+      str += " \(data.pointer[i])"
+    }
+    print(str)
+  }
   var description: String {
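The second fix in this file is an indexing bug in the NCHW-to-NHWC conversion: the flat offset of element (n, c, h, w) in an NCHW-laid-out buffer is n*C*H*W + c*H*W + h*W + w, whereas the old code multiplied h by the loop variable w instead of the width W. A minimal sketch of that mapping (an illustrative helper, not the project's API):

  // Reorder a flat NCHW buffer into NHWC. The source offset uses h * W (the width),
  // which is exactly the term this commit corrects.
  func nchwToNHWC(_ src: [Float32], N: Int, C: Int, H: Int, W: Int) -> [Float32] {
      var dst = [Float32](repeating: 0, count: src.count)
      var index = 0
      for n in 0..<N {
          for h in 0..<H {
              for w in 0..<W {
                  for c in 0..<C {
                      dst[index] = src[n * C * H * W + c * H * W + h * W + w]
                      index += 1
                  }
              }
          }
      }
      return dst
  }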
metal/paddle-mobile/paddle-mobile/framework/Texture.swift

@@ -69,7 +69,7 @@ public class Texture<P: PrecisionType>: Tensorial {
     if MemoryLayout<P>.size == 1 {
       tmpTextureDes.pixelFormat = .rgba8Unorm
     } else if MemoryLayout<P>.size == 2 {
-      tmpTextureDes.pixelFormat = .rgba16Float
+      tmpTextureDes.pixelFormat = .rgba32Float
     } else if MemoryLayout<P>.size == 4 {
       // tmpTextureDes.pixelFormat = .r32Float
       tmpTextureDes.pixelFormat = .rgba32Float
@@ -130,7 +130,7 @@ extension Texture {
   public var debugDescription: String {
     var str = ""
     str += "Dim: \(dim) \n value:[ "
-    // str += "\(metalTexture)"
+    str += " \(metalTexture) "
     str += " ]"
     return str
   }
test/net/test_mobilenet.cpp

@@ -19,7 +19,10 @@ limitations under the License. */
 int main() {
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
   auto time1 = time();
-  auto program = loader.Load(g_mobilenet, true);
+  // auto program = loader.Load(g_mobilenet_combine, true);
+  auto program = loader.Load(g_mobilenet_combine + "/model",
+                             g_mobilenet_combine + "/params", true);
   auto time2 = time();
   DLOG << "load cost :" << time_diff(time1, time1) << "ms";
   paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, true);