From 92577d1a50518c52a194466e1ac3d2dc6ca7f88f Mon Sep 17 00:00:00 2001
From: liuruilong <liuruilong@baidu.com>
Date: Sat, 28 Jul 2018 23:32:29 +0800
Subject: [PATCH] metal cun run

---
 .../xcschemes/paddle-mobile-demo.xcscheme     |   2 +-
 .../paddle-mobile-demo/PreProcessKernel.metal |  16 +-
 .../paddle-mobile-demo/ViewController.swift   |  49 ++--
 .../paddle-mobile/Common/MetalExtension.swift |   4 +-
 .../paddle-mobile/Common/Tools.swift          |   2 +-
 .../paddle-mobile/Executor.swift              |  24 +-
 .../paddle-mobile/paddle-mobile/Loader.swift  |   3 +-
 .../Operators/ConvAddBatchNormReluOp.swift    |  14 +-
 .../Kernels/ConvAddBatchNormReluKernel.swift  |  14 +-
 .../Operators/Kernels/ConvAddKernel.swift     |   3 +
 .../Operators/Kernels/ConvKernel.metal        | 258 ++++++++++++++----
 .../Operators/Kernels/ConvKernel.swift        |   1 +
 .../Operators/Kernels/Kernels.metal           |  78 ++++++
 .../Kernels/Texture2DTo2DArrayKernel.swift    |   2 -
 .../paddle-mobile/framework/Tensor.swift      |  76 +++++-
 .../paddle-mobile/framework/Texture.swift     |   6 +-
 16 files changed, 434 insertions(+), 118 deletions(-)
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/paddle-mobile-demo.xcscheme b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/paddle-mobile-demo.xcscheme
index 46c65bd36a..de579675e0 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/paddle-mobile-demo.xcscheme
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/paddle-mobile-demo.xcscheme
@@ -42,7 +42,7 @@
       </AdditionalOptions>
    </TestAction>
    <LaunchAction
-      buildConfiguration = "Debug"
+      buildConfiguration = "Release"
       selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
       selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
       launchStyle = "0"
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/PreProcessKernel.metal b/metal/paddle-mobile-demo/paddle-mobile-demo/PreProcessKernel.metal
index a9ab6430d6..f359ab39ac 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/PreProcessKernel.metal
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/PreProcessKernel.metal
@@ -20,10 +20,24 @@ kernel void preprocess(
         return;
     }
     const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f);
-    const float4 inColor = (float4(float4(inTexture.read(gid))) * 255.0f - means) * 0.017f;
+    const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
     outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
 }
 
+kernel void preprocess_half(
+                       texture2d<half, access::read> inTexture [[texture(0)]],
+                       texture2d<half, access::write> outTexture [[texture(1)]],
+                       uint2 gid [[thread_position_in_grid]])
+{
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f);
+    const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
+    outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+}
+
 
 
 
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
index 5e96655e76..1d8d50e1c9 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
@@ -26,9 +26,12 @@ class PreProccess: CusomKernel {
     }
 }
 
-
 class ViewController: UIViewController {
     var textureLoader: MTKTextureLoader!
+    var program: Program!
+    var executor: Executor<Float32>!
+    var preprocessKernel: PreProccess!
+    
 //    let queue: MTLCommandQueue
     func scaleTexture(queue: MTLCommandQueue, input: MTLTexture, complete: @escaping (MTLTexture) -> Void) {        
         let tmpTextureDes = MTLTextureDescriptor.init()
@@ -57,18 +60,9 @@ class ViewController: UIViewController {
         unitTest.testConvAddBnRelu()
     }
     
-    override func viewDidLoad() {
-        super.viewDidLoad()
-        
-        if openTest {
-            print(" - testing - ")
-            unitTest()
-            return
-        }
-
-        
-        
-//        return
+    override func touchesBegan(_ touches: Set<UITouch>, with event: UIEvent?) {
+        super.touchesBegan(touches, with: event)
+        //        return
         let queue = MetalHelper.shared.queue
         
         textureLoader = MTKTextureLoader.init(device: MetalHelper.shared.device)
@@ -81,22 +75,33 @@ class ViewController: UIViewController {
         guard let inTexture = texture else {
             fatalError(" texture is nil !")
         }
-       
+        
         scaleTexture(queue: queue, input: inTexture) { (inputTexture) in
-            let loader = Loader<Float32>.init()
             do {
-                let modelPath = Bundle.main.path(forResource: "model", ofType: nil) ?! "model null"
-                let paraPath = Bundle.main.path(forResource: "params", ofType: nil) ?! "para null"
-                let program = try loader.load(device: MetalHelper.shared.device, modelPath: modelPath, paraPath: paraPath)
-                let executor = try Executor<Float32>.init(inDevice: MetalHelper.shared.device, inQueue: queue, inProgram: program)
-                let preprocessKernel = PreProccess.init(device: MetalHelper.shared.device)
-                try executor.predict(input: inputTexture, expect: [1, 224, 224, 3], completionHandle: { (result) in
+                try self.executor.predict(input: inputTexture, expect: [1, 224, 224, 3], completionHandle: { (result) in
                     print(result.resultArr.top(r: 5))
-                }, preProcessKernle: preprocessKernel)
+                }, preProcessKernle: self.preprocessKernel)
             } catch let error {
                 print(error)
             }
         }
     }
+    
+    override func viewDidLoad() {
+        super.viewDidLoad()
+        
+        let queue = MetalHelper.shared.queue
+        let loader = Loader<Float32>.init()
+        preprocessKernel = PreProccess.init(device: MetalHelper.shared.device)
+
+        do {
+            let modelPath = Bundle.main.path(forResource: "model", ofType: nil) ?! "model null"
+            let paraPath = Bundle.main.path(forResource: "params", ofType: nil) ?! "para null"
+            program = try loader.load(device: MetalHelper.shared.device, modelPath: modelPath, paraPath: paraPath)
+            executor = try Executor<Float32>.init(inDevice: MetalHelper.shared.device, inQueue: queue, inProgram: program)
+        } catch let error {
+            print(error)
+        }
+    }
 }
 
diff --git a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
index af4c01d5fe..b750018260 100644
--- a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
@@ -120,8 +120,8 @@ extension MTLComputeCommandEncoder {
         let groupDepth = slices
         let groups = MTLSize.init(width: groupWidth, height: groupHeight, depth: groupDepth)
         
-        print("groups: \(groups) ")
-        print("threads per group: \(threadsPerGroup)")
+//        print("groups: \(groups) ")
+//        print("threads per group: \(threadsPerGroup)")
         
         setComputePipelineState(computePipline)
         
diff --git a/metal/paddle-mobile/paddle-mobile/Common/Tools.swift b/metal/paddle-mobile/paddle-mobile/Common/Tools.swift
index cc1f7a4f21..930198fbf9 100644
--- a/metal/paddle-mobile/paddle-mobile/Common/Tools.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/Tools.swift
@@ -8,7 +8,6 @@
 
 import Foundation
 
-
 func writeToLibrary<P: PrecisionType>(fileName: String, array: [P]) {
     let libraryPath = NSSearchPathForDirectoriesInDomains(.libraryDirectory, .userDomainMask, true).last ?! " library path get error "
     let filePath = libraryPath + "/" + fileName
@@ -19,3 +18,4 @@ func writeToLibrary<P: PrecisionType>(fileName: String, array: [P]) {
     fileHandler.write(data)
     fileHandler.closeFile()
 }
+
diff --git a/metal/paddle-mobile/paddle-mobile/Executor.swift b/metal/paddle-mobile/paddle-mobile/Executor.swift
index e883754b2c..40a80d2ddf 100644
--- a/metal/paddle-mobile/paddle-mobile/Executor.swift
+++ b/metal/paddle-mobile/paddle-mobile/Executor.swift
@@ -57,7 +57,7 @@ public class Executor<P: PrecisionType> {
         queue = inQueue
         for block in inProgram.programDesc.blocks {
             //block.ops.count
-            for i in 0..<2 {
+            for i in 0..<block.ops.count {
                 let op = block.ops[i]
                 do {
                     let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
@@ -109,20 +109,26 @@ public class Executor<P: PrecisionType> {
         }
         
         buffer.addCompletedHandler { (commandbuffer) in
-            let inputArr = resInput.floatArray(res: { (p:P) -> P in
-                return p
-            })
+//            let inputArr = resInput.floatArray(res: { (p:P) -> P in
+//                return p
+//            })
 //            print(inputArr)
             
 //            let stridableInput: [(index: Int, value: Float)] = input.stridableFloatArray()
 //            print(stridableInput)
             
 //            let _: Flo? = input.logDesc(header: "input: ", stridable: true)
-            for op in self.ops {
-                op.delogOutput()
-            }
-            return
+//            for op in self.ops {
+//                op.delogOutput()
+//            }
+//            return
+            
+//            self.ops[2].delogOutput()
+            
             
+            let afterDate = Date.init()
+            print(" encoder end ! time: \(afterDate.timeIntervalSince(beforeDate))")
+
             guard let outputVar = self.program.scope.output() else {
                 fatalError("output nil")
             }
@@ -134,8 +140,6 @@ public class Executor<P: PrecisionType> {
                 return p
             }))
             completionHandle(resultHodlder)
-            let afterDate = Date.init()
-            print(" encoder end ! time: \(afterDate.timeIntervalSince(beforeDate))")
         }
         buffer.commit()
     }
diff --git a/metal/paddle-mobile/paddle-mobile/Loader.swift b/metal/paddle-mobile/paddle-mobile/Loader.swift
index 31fd21ebd4..472c588430 100644
--- a/metal/paddle-mobile/paddle-mobile/Loader.swift
+++ b/metal/paddle-mobile/paddle-mobile/Loader.swift
@@ -15,7 +15,6 @@
 import Foundation
 import SwiftProtobuf
 
-
 public class Loader<P: PrecisionType> {
     class ParaLoader {
         let file: UnsafeMutablePointer<FILE>
@@ -163,7 +162,7 @@ public class Loader<P: PrecisionType> {
                                 throw error
                             }
                             tensor.convert(to: .NHWC)
-                            tensor.initBuffer(device: device)
+//                            tensor.initBuffer(device: device)
                             scope[varDesc.name] = tensor
                         } else {
                             let dim = Dim.init(inDim: tensorDesc.NHWCDim)
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
index 8746ba980d..f24e25b054 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
@@ -116,9 +116,17 @@ class ConvAddBatchNormReluOp<P: PrecisionType>: Operator<ConvAddBatchNormReluKer
 //        print("padding: \(para.paddings)")
 //        print("stride: \(para.stride)")
         
-        let _: P? = para.y.buffer?.logDesc(header: " biase: ", stridable: false)
-        let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false)
-        let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false)
+//        let _: P? = para.y.buffer?.logDesc(header: " biase: ", stridable: false)
+//        let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false)
+//        let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false)
+        
+        let output = para.output.metalTexture.floatArray { (p: P) -> P in
+            return p
+        }
+//
+        writeToLibrary(fileName: "output_112x112x32_2", array: output)
+        print(" write done")
+        
 //        let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false)
     }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
index e8ee935390..0ffe90272f 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
@@ -58,6 +58,14 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
             super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
         }
         
+        param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
+        param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
+
+        param.variance.initBuffer(device: device)
+        param.mean.initBuffer(device: device)
+        param.scale.initBuffer(device: device)
+        param.bias.initBuffer(device: device)
+        
         let offsetX = param.filter.width/2 - Int(param.paddings[0])
         let offsetY = param.filter.height/2 - Int(param.paddings[1])
         
@@ -70,7 +78,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
         var invs: [P] = []
         let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
         
-        for i in 0..<param.variance.buffer.length/MemoryLayout<P>.stride {
+        for i in 0..<param.variance.buffer.length/MemoryLayout<P>.stride {            
             let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
             invs.append(P(inv))
         }
@@ -78,7 +86,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
         let newScale: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.scale.buffer.length)
         let newBiase: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.bias.buffer.length)
         
-        let scaleContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
+        let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
         let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
         let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
         for i in 0..<param.scale.buffer.length/MemoryLayout<P>.stride {
@@ -100,7 +108,6 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
             throw PaddleMobileError.predictError(message: " encode is nil")
         }
 
-        print("ConvAddBatchNormReluKernel compute")
         encoder.setTexture(param.input.metalTexture, index: 0)
         encoder.setTexture(param.output.metalTexture, index: 1)
         encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
@@ -117,7 +124,6 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
             fatalError()
         }
         
-        print("ConvAddBatchNormReluKernel compute")
         encoder.setTexture(param.inputTexture, index: 0)
         encoder.setTexture(param.outputTexture, index: 1)
         var inMetalParam = param.metalParam
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
index 81bfda997c..1f24a1642e 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
@@ -21,6 +21,9 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
         let offsetX = param.filter.width/2 - Int(param.paddings[0])
         let offsetY = param.filter.height/2 - Int(param.paddings[1])
         
+        param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
+        param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
+        
         print("offset x: \(offsetX)")
         print("offset y: \(offsetY)")
         
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal
index a738d55e39..9d0c6de35e 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal
@@ -24,53 +24,58 @@ struct MetalConvParam {
 };
 
 
-//kernel void conv_add_batch_norm_relu_3x3(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-//                                        texture2d_array<half, access::write> outTexture [[texture(1)]],
-//                                         constant MetalConvParam &param [[buffer(0)]],
-//                                         const device half4 *weights [[buffer(1)]],
-//                                         const device half4 *biase [[buffer(2)]],
-//                                         const device half4 *new_scale [[buffer(3)]],
-//                                         const device half4 *new_biase [[buffer(4)]],
-//                                         uint3 gid [[thread_position_in_grid]]) {
-//
-//    if (gid.x >= outTexture.get_width() ||
-//        gid.y >= outTexture.get_height() ||
-//        gid.z >= outTexture.get_array_size()) {
-//        return;
-//    }
-//
-//    short2 posInInput = short2(gid.xy) + short2(param.offsetX, param.offsetY);
-//    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-//    const uint wightSliceCount = 36;
-//    uint weithTo = gid.z * wightSliceCount * inTexture.get_array_size();
-//    half4 output = 0.0;
-//    for (uint i = 0; i < inTexture.get_array_size(); ++i) {
-//        half4 input[9];
-//        input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
-//        input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
-//        input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
-//        input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
-//        input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-//        input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
-//        input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
-//        input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
-//        input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
-//        for (int j = 0; j < 9; ++j) {
-//            half4 weight = weights[weithTo + wightSliceCount * i + j * 4];
-//            output += dot(input[j], weight);
-//        }
-//    }
-//
-//    output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0h);
-//    outTexture.write(output, gid.xy, gid.z);
-//
-//}
+kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                         texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                         constant MetalConvParam &param [[buffer(0)]],
+                                         const device half4 *weights [[buffer(1)]],
+                                         const device half4 *biase [[buffer(2)]],
+                                         const device float4 *new_scale [[buffer(3)]],
+                                         const device float4 *new_biase [[buffer(4)]],
+                                         uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    half4 output = half4(0.0);
+    
+    half4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+        output.x += dot(input, weight_x);
+        
+        half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+        output.y += dot(input, weight_y);
+        
+        half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+        output.z += dot(input, weight_z);
+        
+        half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(input, weight_w);
+    }
+    
+    output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
+    outTexture.write(output, gid.xy, gid.z);
+}
 
-kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                         texture2d_array<half, access::write> outTexture [[texture(1)]],
                                          constant MetalConvParam &param [[buffer(0)]],
-                                         const device float4 *weights [[buffer(1)]],
-                                         const device float4 *biase [[buffer(2)]],
+                                         const device half4 *weights [[buffer(1)]],
+                                         const device half4 *biase [[buffer(2)]],
                                          const device float4 *new_scale [[buffer(3)]],
                                          const device float4 *new_biase [[buffer(4)]],
                                          uint3 gid [[thread_position_in_grid]]) {
@@ -89,9 +94,9 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample>
     uint input_arr_size = inTexture.get_array_size();
     uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
     
-    float4 output = float4(0.0);
+    half4 output = half4(0.0);
     
-    float4 input[9];
+    half4 input[9];
     for (uint i = 0; i < input_arr_size; ++i) {
         input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
         input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
@@ -103,23 +108,113 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample>
         input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
         input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
         for (int j = 0; j < 9; ++j) {
-            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
             output.x += dot(input[j], weight_x);
             
-            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
             output.y += dot(input[j], weight_y);
             
-            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
             output.z += dot(input[j], weight_z);
             
-            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
             output.w += dot(input[j], weight_w);
         }
     }
-    output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+    output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
+    outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void conv_add_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<half, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device half4 *weights [[buffer(1)]],
+                         const device half4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    half4 output = half4(0.0);
+    
+    half4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+        output.x += dot(input, weight_x);
+        
+        half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+        output.y += dot(input, weight_y);
+        
+        half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+        output.z += dot(input, weight_z);
+        
+        half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(input, weight_w);
+    }
+    output = output + biase[gid.z];
     outTexture.write(output, gid.xy, gid.z);
 }
 
+kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                                   texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                                   constant MetalConvParam &param [[buffer(0)]],
+                                                   const device half *weights [[buffer(1)]],
+                                                   const device half4 *biase [[buffer(2)]],
+                                                   const device float4 *new_scale [[buffer(3)]],
+                                                   const device float4 *new_biase [[buffer(4)]],
+                                                   uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z;
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint weithTo = gid.z * kernelHXW * 4;
+    half4 output = half4(0.0);
+    half4 inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {
+        half4 input = inputs[j];
+        output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+    }
+    output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
+    outTexture.write(output, gid.xy, gid.z);
+}
+
+
+/*---------------------------------------------*/
+
+
+
 kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                                          texture2d_array<float, access::write> outTexture [[texture(1)]],
                                          constant MetalConvParam &param [[buffer(0)]],
@@ -165,6 +260,60 @@ kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample>
     outTexture.write(output, gid.xy, gid.z);
 }
 
+kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                         constant MetalConvParam &param [[buffer(0)]],
+                                         const device float4 *weights [[buffer(1)]],
+                                         const device float4 *biase [[buffer(2)]],
+                                         const device float4 *new_scale [[buffer(3)]],
+                                         const device float4 *new_biase [[buffer(4)]],
+                                         uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = float4(0.0);
+    
+    float4 input[9];
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+        for (int j = 0; j < 9; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
+    }
+    output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+    outTexture.write(output, gid.xy, gid.z);
+}
+
 kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                                          texture2d_array<float, access::write> outTexture [[texture(1)]],
                                          constant MetalConvParam &param [[buffer(0)]],
@@ -208,7 +357,6 @@ kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[text
     outTexture.write(output, gid.xy, gid.z);
 }
 
-
 kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                                          texture2d_array<float, access::write> outTexture [[texture(1)]],
                                          constant MetalConvParam &param [[buffer(0)]],
@@ -224,7 +372,6 @@ kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access
         return;
     }
     uint output_slice = gid.z;
-    
     ushort2 stride = ushort2(param.strideX, param.strideY);
     ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
     constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
@@ -248,7 +395,6 @@ kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access
         output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
         output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
     }
-    output = (output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z];
+    output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
     outTexture.write(output, gid.xy, gid.z);
 }
-
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
index 27c4fe1428..92c43fe321 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
@@ -31,6 +31,7 @@ class ConvKernel<P: PrecisionType>: Kernel, Computable {
         let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0])
         let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1])
         let offsetZ = 0.0
+        param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
         
         metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]))
     }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernels.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernels.metal
index 5b797178f5..92ee118452 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernels.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernels.metal
@@ -96,6 +96,17 @@ kernel void texture2d_to_2d_array(texture2d<float, access::read> inTexture [[tex
 }
 
 
+kernel void texture2d_to_2d_array_half(texture2d<half, access::read> inTexture [[texture(0)]],
+                                  texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                  uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= inTexture.get_width() ||
+        gid.y >= inTexture.get_height()){
+        return;
+    }
+    const half4 input = inTexture.read(gid.xy);
+    outTexture.write(input, gid.xy, 0);
+}
+
 struct PoolParam {
     int ksizeX;
     int ksizeY;
@@ -140,6 +151,39 @@ kernel void pool(texture2d_array<float, access::read> inTexture [[texture(0)]],
 }
 
 
+kernel void pool_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                 texture2d_array<half, access::write> outTexture [[texture(1)]],
+                 constant PoolParam &pm [[buffer(0)]],
+                 uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    int xmin = gid.x * pm.strideX - pm.paddingX;
+    int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
+    xmin = max(xmin, 0);
+    int ymin = gid.y * pm.strideX - pm.paddingX;
+    int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height()));
+    ymin = max(ymin, 0);
+    
+    half4 r = 0;
+    if (pm.poolType == 0) {
+        r = inTexture.read(uint2(xmin, ymin), gid.z);
+        for (int x = xmin; x < xmax; x++) {
+            for (int y = ymin; y < ymax; y++) {
+                r = fmax(r, inTexture.read(uint2(x, y), gid.z));
+            }
+        }
+    } else if (pm.poolType == 1) {
+        for (int x = xmin; x < xmax; x++) {
+            for (int y = ymin; y < ymax; y++) {
+                r += inTexture.read(uint2(x, y), gid.z);
+            }
+        }
+        r /= pm.ksizeX * pm.ksizeY;
+    }
+    outTexture.write(r, gid.xy, gid.z);
+}
+
 kernel void reshape(texture2d_array<float, access::read> inTexture [[texture(0)]],
                     texture2d_array<float, access::write> outTexture [[texture(1)]],
                     uint3 gid [[thread_position_in_grid]]) {
@@ -151,6 +195,17 @@ kernel void reshape(texture2d_array<float, access::read> inTexture [[texture(0)]
     outTexture.write(r, gid.xy, gid.z);
 }
 
+kernel void reshape_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                    texture2d_array<half, access::write> outTexture [[texture(1)]],
+                    uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    
+    half4 r = inTexture.read(uint2(0, 0), gid.z);
+    outTexture.write(r, gid.xy, gid.z);
+}
+
 kernel void softmax(texture2d_array<float, access::read> inTexture [[texture(0)]],
                     texture2d_array<float, access::write> outTexture [[texture(1)]],
                     uint3 gid [[thread_position_in_grid]]) {
@@ -172,3 +227,26 @@ kernel void softmax(texture2d_array<float, access::read> inTexture [[texture(0)]
     rr = exp(rr - maxv) / sum;
     outTexture.write(rr, gid.xy, gid.z);
 }
+
+
+kernel void softmax_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                    texture2d_array<half, access::write> outTexture [[texture(1)]],
+                    uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    int zsize = inTexture.get_array_size();
+    half maxv = inTexture.read(uint2(0, 0), 0)[0];
+    for (int z = 0; z < zsize; z++) {
+        half4 r = inTexture.read(uint2(0, 0), z);
+        maxv = max(maxv, max(max(r[0], r[1]), max(r[2], r[3])));
+    }
+    float sum = 0;
+    for (int z = 0; z < zsize; z++) {
+        half4 r = inTexture.read(uint2(0, 0), z);
+        sum += exp(r[0] - maxv) + exp(r[1] - maxv) + exp(r[2] - maxv) + exp(r[3] - maxv);
+    }
+    half4 rr = inTexture.read(gid.xy, gid.z);
+    rr = exp(rr - maxv) / sum;
+    outTexture.write(rr, gid.xy, gid.z);
+}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Texture2DTo2DArrayKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Texture2DTo2DArrayKernel.swift
index 3bf912c873..b524c3ac80 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Texture2DTo2DArrayKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Texture2DTo2DArrayKernel.swift
@@ -20,7 +20,6 @@ struct Texture2DTo2DArrayParam {
     let expectDim: Dim
 }
 
-
 class Texture2DTo2DArrayKernel<P: PrecisionType>: Kernel, Computable{
     func compute(commandBuffer: MTLCommandBuffer, param: FeedParam<P>) throws {
         guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
@@ -36,4 +35,3 @@ class Texture2DTo2DArrayKernel<P: PrecisionType>: Kernel, Computable{
         super.init(device: device, inFunctionName: "texture2d_to_2d_array")
     }
 }
-
diff --git a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
index 39891c77de..beed19b653 100644
--- a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
@@ -12,6 +12,7 @@
  See the License for the specific language governing permissions and
  limitations under the License. */
 
+import Accelerate
 import Foundation
 
 protocol Tensorial: CustomStringConvertible, CustomDebugStringConvertible{
@@ -27,6 +28,10 @@ extension Tensorial {
 }
 
 class Tensor<P: PrecisionType>: Tensorial {
+    enum BufferPrecision {
+        case Float32, Float16
+    }
+    
     var data: Data
     var dim: Dim
     var buffer: MTLBuffer!
@@ -88,7 +93,28 @@ class Tensor<P: PrecisionType>: Tensorial {
         layout = to
     }
     
-    func initBuffer(device: MTLDevice) {
+    func float32ToFloat16(input: UnsafeMutablePointer<Float32>, output: UnsafeMutableRawPointer, count: Int) {
+        var float32Buffer = vImage_Buffer(data: input,  height: 1, width: UInt(count), rowBytes: count * 4)
+        var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2)
+        guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else {
+            fatalError(" float 32 to float 16 error ! ")
+        }
+    }
+    
+    func initBuffer(device: MTLDevice, precision: BufferPrecision = .Float32) {
+        guard let floatPointer = data.pointer as? UnsafeMutablePointer<Float32> else {
+            fatalError(" not support yet ")
+        }
+        
+        
+        let precisionSize: Int
+        switch precision {
+        case .Float32:
+            precisionSize = 4
+        case .Float16:
+            precisionSize = 2
+        }
+        
         if dim.cout() == 4 {
             if layout == .NHWC {
                 let C = dim[3]
@@ -96,29 +122,55 @@ class Tensor<P: PrecisionType>: Tensorial {
                 let paddedC = cSlices * 4
                 let count = paddedC * dim[0] * dim[1] * dim[2]
                 if C == paddedC {
-                    buffer = device.makeBuffer(length: count * MemoryLayout<P>.stride)
-                    buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
+                    buffer = device.makeBuffer(length: count * precisionSize)
+                    switch precision {
+                    case .Float32:
+                        buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
+                    case .Float16:
+                        float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
+                    }
                 } else if C == 1 {
-                    buffer = device.makeBuffer(length: numel() * MemoryLayout<P>.stride)
-                    buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
+                    buffer = device.makeBuffer(length: numel() * precisionSize)
+                    switch precision {
+                    case .Float32:
+                        buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
+                    case .Float16:
+                        float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
+                    }
                 } else {
-                    buffer = device.makeBuffer(length: count * MemoryLayout<P>.stride)
-                    var tmpPointer = data.pointer
-                    var dstPtr = buffer?.contents().bindMemory(to: P.self, capacity: count)
+                    buffer = device.makeBuffer(length: count * precisionSize)
+                    let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
+                    var tmpPointer = floatPointer
+                    var dstPtr = convertedPointer
                     for _ in 0..<dim[0] * dim[1] * dim[2] {
                         for j in 0..<paddedC {
                             if j < C {
-                                dstPtr?[j] = tmpPointer[j]
+                                dstPtr[j] = tmpPointer[j]
                             }
                         }
                         tmpPointer += C
-                        dstPtr! += paddedC
+                        dstPtr += paddedC
+                    }
+                    
+                    switch precision {
+                    case .Float32:
+                        buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
+                    case .Float16:
+                        float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
                     }
+                    
+                    convertedPointer.deinitialize(count: count)
+                    convertedPointer.deallocate()
                 }
             }
         } else if dim.cout() == 1 {
-            buffer = device.makeBuffer(length: numel() * MemoryLayout<P>.stride)
-            buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
+            buffer = device.makeBuffer(length: numel() * precisionSize)
+            switch precision {
+            case .Float32:
+                buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
+            case .Float16:
+                float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
+            }
         } else {
             fatalError(" not support !")
         }
diff --git a/metal/paddle-mobile/paddle-mobile/framework/Texture.swift b/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
index 50f9f7d067..81894664c5 100644
--- a/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
@@ -68,16 +68,18 @@ public class Texture<P: PrecisionType>: Tensorial {
         } else {
             fatalError(" not suuprt ")
         }
+        
         if MemoryLayout<P>.size == 1 {
             tmpTextureDes.pixelFormat = .rgba8Unorm
         } else if MemoryLayout<P>.size == 2 {
-            tmpTextureDes.pixelFormat = .rgba32Float
+            tmpTextureDes.pixelFormat = .rgba16Float
         } else if MemoryLayout<P>.size == 4 {
 //            tmpTextureDes.pixelFormat = .r32Float
             tmpTextureDes.pixelFormat = .rgba32Float
 
         }
-        
+//        tmpTextureDes.pixelFormat = .rgba16Float
+
         tmpTextureDes.usage = [.shaderRead, .shaderWrite]
         tmpTextureDes.storageMode = .shared
         textureDesc = tmpTextureDes
-- 
GitLab