diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist
index 8f61f4a88a7bcbe39bbb56e22ef203803776fdec..3ac3272f7fe5e82e15488422c949e3df19853f90 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist
@@ -7,7 +7,7 @@
 		<key>paddle-mobile-demo.xcscheme</key>
 		<dict>
 			<key>orderHint</key>
-			<integer>2</integer>
+			<integer>1</integer>
 		</dict>
 	</dict>
 	<key>SuppressBuildableAutocreation</key>
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift
index 9bcb7c191fd1032d1ef98c0b49d6c483bca0b5df..322a6b9f67cb7df2a58679496e3c8a31d08844b5 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift
@@ -33,7 +33,7 @@ class MobileNet_ssd_hand: Net{
     return " \(res)"
   }
 
-  func fetchResult(paddleMobileRes: ResultHolder<Float32>) -> [Float32]{
+  func fetchResult(paddleMobileRes: ResultHolder<Float32>) -> [Float32] {
 
     guard let interRes = paddleMobileRes.intermediateResults else {
       fatalError(" need have inter result ")
@@ -46,12 +46,13 @@ class MobileNet_ssd_hand: Net{
     guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture<Float32> else {
       fatalError()
     }
-
+    
     var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.originDim[0], h: score.originDim[1], w: score.originDim[2], c: score.originDim[3]))
+    
     var bboxArr = bbox.metalTexture.floatArray { (f) -> Float32 in
       return f
     }
-
+    
     let nmsCompute = NMSCompute.init()
     nmsCompute.scoreThredshold = 0.01
     nmsCompute.nmsTopK = 200
@@ -79,6 +80,7 @@ class MobileNet_ssd_hand: Net{
   let modelDir: String
   
   
+  
 //  let paramPointer: UnsafeMutableRawPointer
 //
 //  let paramSize: Int
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal
index 75d9dc618ddb9c8d8da1d33f87c1598d78c1edf0..edbb19c980102796a504252bf35ebc9a6d0513fb 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal
@@ -85,6 +85,17 @@ kernel void genet_preprocess(texture2d<float, access::read> inTexture [[texture(
   outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
 }
 
+// Half-precision preprocessing: computes (texel * 255 - 128) * 0.017 for the first three channels and writes them with x and z swapped.
+kernel void genet_preprocess_half(texture2d<half, access::read> inTexture [[texture(0)]], texture2d<half, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]])
+{
+  // Threads that fall outside the output texture have nothing to write.
+  if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height()) return;
+  const half4 channelMean = half4(128.0f, 128.0f, 128.0f, 0.0f);
+  const half4 normalized = (inTexture.read(gid) * 255.0 - channelMean) * 0.017;
+  // The fourth channel is always written as zero.
+  outTexture.write(half4(normalized.z, normalized.y, normalized.x, 0.0f), gid);
+}
+
 kernel void scale(texture2d<float, access::sample> inTexture [[texture(0)]], texture2d<float, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) {
   if (gid.x >= outTexture.get_width() ||
       gid.y >= outTexture.get_height()) return;
@@ -95,12 +106,13 @@ kernel void scale(texture2d<float, access::sample> inTexture [[texture(0)]], tex
   outTexture.write(input, gid);
 }
 
-
-
-
-
-
-
-
-
+// Resamples a float input texture into a half output texture with nearest-neighbour sampling; each thread writes one output texel.
+kernel void scale_half(texture2d<float, access::sample> inTexture [[texture(0)]], texture2d<half, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height()) return;
+  // Divide as floats: the texture dimensions are integers, so an integer divide would truncate non-integral scale ratios.
+  float w_stride = float(inTexture.get_width()) / float(outTexture.get_width());
+  float h_stride = float(inTexture.get_height()) / float(outTexture.get_height());
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  outTexture.write(half4(inTexture.sample(sample, float2(gid.x * w_stride, gid.y * h_stride), 0)), gid);
+}
 
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
index e2351d14e317ada506d1f15c59efda22c984cab8..a9fd4dd515ac76a59eeb98f71557f64e6e3f2058 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
@@ -79,7 +79,7 @@ class ViewController: UIViewController {
       return
     }
     do {
-      let max = 10
+      let max = 50
       let startDate = Date.init()
       for i in 0..<max {
         try net.predict(inTexture: inTexture) { [weak self] (result) in
@@ -87,7 +87,7 @@ class ViewController: UIViewController {
             fatalError()
           }
           
-          print(result.resultArray)
+//          print(result.resultArray)
           if i == max - 1 {
             let time = Date.init().timeIntervalSince(startDate)
             DispatchQueue.main.async {
diff --git a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
index e04ab7bd0d5b6c183cb8d73914a50440f4bfb10b..55dfe91ce0a700ff7c47cdff55b461908f1657ad 100644
--- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
@@ -12,6 +12,8 @@
 		4AF928822135673D005B6C3A /* Concat.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF928812135673D005B6C3A /* Concat.metal */; };
 		4AF9288421357BE3005B6C3A /* Elementwise.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF9288321357BE3005B6C3A /* Elementwise.metal */; };
 		D3831F70E7E0B565B9AC22DA /* Pods_paddle_mobile.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = DD2E06330A1E7129C918DB46 /* Pods_paddle_mobile.framework */; };
+		FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC0226552138F33800F395E2 /* TransposeKernel.metal */; };
+		FC0226582138F38D00F395E2 /* PoolKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC0226572138F38D00F395E2 /* PoolKernel.metal */; };
 		FC039B6F20E11C3C0081E9F8 /* paddle_mobile.h in Headers */ = {isa = PBXBuildFile; fileRef = FC039B6D20E11C3C0081E9F8 /* paddle_mobile.h */; settings = {ATTRIBUTES = (Public, ); }; };
 		FC039B9720E11C9A0081E9F8 /* Extensions.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B9420E11C9A0081E9F8 /* Extensions.swift */; };
 		FC039B9820E11C9A0081E9F8 /* Errors.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B9520E11C9A0081E9F8 /* Errors.swift */; };
@@ -53,6 +55,9 @@
 		FCA3A1632132A4AC00084FE5 /* ReshapeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */; };
 		FCA3A1652132A5EB00084FE5 /* Common.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA3A1642132A5EB00084FE5 /* Common.metal */; };
 		FCA67B1721364EF000BD58AA /* ConvTransposeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */; };
+		FCA67CD52138272900BD58AA /* ConvAddMetal.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67CD42138272900BD58AA /* ConvAddMetal.metal */; };
+		FCA67CD7213827AC00BD58AA /* ConvAddBNReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67CD6213827AC00BD58AA /* ConvAddBNReluKernel.metal */; };
+		FCA67CD92138287B00BD58AA /* ConvBNReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67CD82138287B00BD58AA /* ConvBNReluKernel.metal */; };
 		FCBCCC572122F41300D94F7E /* DwConvBNReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */; };
 		FCBCCC592122F42700D94F7E /* ConvBNReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */; };
 		FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */; };
@@ -97,6 +102,8 @@
 		CDF58151D902A1CBAE56A0C2 /* Pods-paddle-mobile.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile.debug.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile/Pods-paddle-mobile.debug.xcconfig"; sourceTree = "<group>"; };
 		DD2E06330A1E7129C918DB46 /* Pods_paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; };
 		E2A7957C92EDA5C3BEC0FFC2 /* Pods-paddle-mobile.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile.release.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile/Pods-paddle-mobile.release.xcconfig"; sourceTree = "<group>"; };
+		FC0226552138F33800F395E2 /* TransposeKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = TransposeKernel.metal; sourceTree = "<group>"; };
+		FC0226572138F38D00F395E2 /* PoolKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PoolKernel.metal; sourceTree = "<group>"; };
 		FC039B6A20E11C3C0081E9F8 /* paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; };
 		FC039B6D20E11C3C0081E9F8 /* paddle_mobile.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = paddle_mobile.h; sourceTree = "<group>"; };
 		FC039B6E20E11C3C0081E9F8 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
@@ -141,6 +148,9 @@
 		FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ReshapeKernel.metal; sourceTree = "<group>"; };
 		FCA3A1642132A5EB00084FE5 /* Common.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Common.metal; sourceTree = "<group>"; };
 		FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvTransposeKernel.metal; sourceTree = "<group>"; };
+		FCA67CD42138272900BD58AA /* ConvAddMetal.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddMetal.metal; sourceTree = "<group>"; };
+		FCA67CD6213827AC00BD58AA /* ConvAddBNReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddBNReluKernel.metal; sourceTree = "<group>"; };
+		FCA67CD82138287B00BD58AA /* ConvBNReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvBNReluKernel.metal; sourceTree = "<group>"; };
 		FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DwConvBNReluOp.swift; sourceTree = "<group>"; };
 		FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvBNReluOp.swift; sourceTree = "<group>"; };
 		FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvBNReluKernel.swift; sourceTree = "<group>"; };
@@ -372,6 +382,11 @@
 				FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */,
 				FCA3A1642132A5EB00084FE5 /* Common.metal */,
 				FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */,
+				FCA67CD42138272900BD58AA /* ConvAddMetal.metal */,
+				FCA67CD6213827AC00BD58AA /* ConvAddBNReluKernel.metal */,
+				FCA67CD82138287B00BD58AA /* ConvBNReluKernel.metal */,
+				FC0226552138F33800F395E2 /* TransposeKernel.metal */,
+				FC0226572138F38D00F395E2 /* PoolKernel.metal */,
 			);
 			path = metal;
 			sourceTree = "<group>";
@@ -478,6 +493,7 @@
 			files = (
 				FC9D038020E22FBB000F735A /* FeedOp.swift in Sources */,
 				FC039B9F20E11CB20081E9F8 /* Tensor.swift in Sources */,
+				FCA67CD7213827AC00BD58AA /* ConvAddBNReluKernel.metal in Sources */,
 				4AF9287921341661005B6C3A /* Softmax.metal in Sources */,
 				FC0E2DBC20EE45FE009C1FAC /* ConvKernel.swift in Sources */,
 				FC039BAA20E11CBC0081E9F8 /* ElementwiseAddOp.swift in Sources */,
@@ -493,12 +509,15 @@
 				FC1B186620ECF1C600678B91 /* ResizeKernel.swift in Sources */,
 				FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */,
 				FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */,
+				FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */,
 				FCDDC6C6212F9FB800E5EF74 /* PreluKernel.swift in Sources */,
+				FCA67CD52138272900BD58AA /* ConvAddMetal.metal in Sources */,
 				FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */,
 				FCEBC0F420F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift in Sources */,
 				FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */,
 				4AF928772133F1DB005B6C3A /* BoxCoder.metal in Sources */,
 				FCEB684C212F093800D2448E /* PreluOp.swift in Sources */,
+				FCA67CD92138287B00BD58AA /* ConvBNReluKernel.metal in Sources */,
 				FC60DB8920E9AAA500FF203F /* MetalExtension.swift in Sources */,
 				FCEBC0F620F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift in Sources */,
 				FCDDC6CA212FDF6800E5EF74 /* BatchNormKernel.metal in Sources */,
@@ -550,6 +569,7 @@
 				FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */,
 				FC039BC020E11CC20081E9F8 /* BlockDesc.swift in Sources */,
 				FCD04E6820F315020007374F /* PoolKernel.swift in Sources */,
+				FC0226582138F38D00F395E2 /* PoolKernel.metal in Sources */,
 				FC039BAD20E11CBC0081E9F8 /* ReluOp.swift in Sources */,
 				FCBCCC572122F41300D94F7E /* DwConvBNReluOp.swift in Sources */,
 				FC039BBE20E11CC20081E9F8 /* OpDesc.swift in Sources */,
diff --git a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
index e250c37c49d642ce3a4aa5610c4d0dbf3a9ce9fb..352d036e0082c5713a1ab2b8c86a053f5da0d446 100644
--- a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
@@ -341,14 +341,27 @@ public extension MTLTexture {
   }
   
   // n c h w - dim
-  func toTensor(dim: (n: Int, c: Int, h: Int, w: Int)) -> [Float32] {
+  func toTensor(dim: (n: Int, c: Int, h: Int, w: Int), texturePrecision: ComputePrecision = .Float16) -> [Float32] {
 //    print("origin dim: \(dim)")
     print("texture: ")
     print(self)
+    var textureArray: [Float32]
+    // NOTE(review): texturePrecision is currently ignored; the conversion below is chosen from the texture's pixelFormat.
     
-    let textureArray = floatArray { (i : Float32) -> Float32 in
-      return i
+    if pixelFormat == .rgba32Float {
+      textureArray = floatArray { (i : Float32) -> Float32 in
+        return i
+      }
+    } else if pixelFormat == .rgba16Float {
+      
+      var textureFloat16Array = floatArray { (i : Float16) -> Float16 in
+        return i
+      }
+      textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count)
+    } else {
+      fatalError(" 目前还不支持其他类型 ")
     }
+    
     var output: [Float32] = []
     for s in 0..<arrayLength {
       for c in 0..<4{
@@ -366,16 +379,27 @@ public extension MTLTexture {
     return output
   }
   
-  func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int)) -> [Float32] {
+  func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int), texturePrecision: ComputePrecision = .Float16) -> [Float32] {
 //    print("origin dim: \(dim)")
 //    print("texture: ")
 //    print(self)
     
-    let textureArray = floatArray { (i : Float32) -> Float32 in
-      return i
+    var textureArray: [Float32]
+    // NOTE(review): texturePrecision is currently ignored; the conversion below is chosen from the texture's pixelFormat.
+    if pixelFormat == .rgba32Float {
+      textureArray = floatArray { (i : Float32) -> Float32 in
+        return i
+      }
+    } else if pixelFormat == .rgba16Float {
+      var textureFloat16Array = floatArray { (i : Float16) -> Float16 in
+        return i
+      }
+      textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count)
+    } else {
+      fatalError(" 目前还不支持其他类型 ")
     }
+    
     var output: [Float32] = []
-
     let numOfASlice = dim.h * dim.w * 4
     for h in 0..<dim.h {
       for w in 0..<dim.w {
@@ -394,7 +418,6 @@ public extension MTLTexture {
         }
       }
     }
-//    print(" tensor count -- \(output.count)")
     return output
   }
   
diff --git a/metal/paddle-mobile/paddle-mobile/Common/Types.swift b/metal/paddle-mobile/paddle-mobile/Common/Types.swift
index d294cc2b08f1a1e8225130981bdc4c702d1559cc..9e5a66dbd2f0ace1f4f727fffad6c94b9061b457 100644
--- a/metal/paddle-mobile/paddle-mobile/Common/Types.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/Types.swift
@@ -13,6 +13,7 @@
  limitations under the License. */
 
 import Foundation
+import Accelerate
 
 public protocol SummableMultipliable: Equatable {
   static func +(lhs: Self, rhs: Self) -> Self
@@ -78,6 +79,28 @@ extension Float32: PrecisionType {
   }
 }
 
+/// Converts `count` Float32 values starting at `input` into 16-bit floats written to `output`, using vImage.
+/// `output` is described to vImage as one row of `count * 2` bytes; any vImage error is fatal.
+public func float32ToFloat16(input: UnsafeMutablePointer<Float32>, output: UnsafeMutableRawPointer, count: Int) {
+  var source = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4)
+  var destination = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2)
+  guard vImageConvert_PlanarFtoPlanar16F(&source, &destination, 0) == kvImageNoError else { fatalError(" float 32 to float 16 error ! ") }
+}
+
+/// Returns a new array holding `count` 16-bit floats read from `input`, widened to Float32 by `float16to32(input:output:count:)`.
+public func float16To32(input: UnsafeMutablePointer<Float16>, count: Int) -> [Float32] {
+  var widened = [Float32](repeating: 0, count: count)
+  float16to32(input: input, output: &widened, count: count)
+  return widened
+}
+
+/// Widens `count` 16-bit floats at `input` into the caller-provided Float32 buffer `output`, using vImage; any vImage error is fatal.
+public func float16to32(input: UnsafeMutablePointer<Float16>, output: UnsafeMutablePointer<Float32>, count: Int) {
+  var source = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 2)
+  var destination = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 4)
+  guard vImageConvert_Planar16FtoPlanarF(&source, &destination, 0) == kvImageNoError else { fatalError(" convert float16 to float32 error") }
+}
+
 // N - 0   C - 1   H - 2   W - 3
 struct DataLayout {
   
diff --git a/metal/paddle-mobile/paddle-mobile/Executor.swift b/metal/paddle-mobile/paddle-mobile/Executor.swift
index 47c91de096ebaae1899a5ee6a0d4b85f5b1f99da..efdc0b2164de43158f4fd7477c587582e2098bcf 100644
--- a/metal/paddle-mobile/paddle-mobile/Executor.swift
+++ b/metal/paddle-mobile/paddle-mobile/Executor.swift
@@ -16,6 +16,8 @@ import Foundation
 
 let testTo = 54
 
+let computePrecision: ComputePrecision = .Float32
+
 public class ResultHolder<P: PrecisionType> {
   public let dim: [Int]
   public let resultArr: [P]
@@ -66,7 +68,6 @@ public class Executor<P: PrecisionType> {
         let op = block.ops[i]
         do {
           let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
-//          op.inferShape()
           ops.append(op)
         } catch let error {
           throw error
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift
index eaa596e071b7628339be185a7e3599a370763041..1bf5cde92eba79dbd8be2ca8cbd17e0398c428d5 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift
@@ -58,28 +58,26 @@ class BoxcoderOp<P: PrecisionType>: Operator<BoxcoderKernel<P>, BoxcoderParam<P>
   }
   
   func delogOutput() {
-    
     print(" \(type) output: ")
-    let priorBoxOriginDim = para.priorBox.originDim
-    let priorBoxArray = para.priorBox.metalTexture.realNHWC(dim: (n: priorBoxOriginDim[0], h: priorBoxOriginDim[1], w: priorBoxOriginDim[2], c: priorBoxOriginDim[3]))
-    print(" prior box ")
-    print(priorBoxArray.strideArray())
-    
-    let priorBoxVarOriginDim = para.priorBoxVar.originDim
-    let priorBoxVarArray = para.priorBoxVar.metalTexture.realNHWC(dim: (n: priorBoxVarOriginDim[0], h: priorBoxVarOriginDim[1], w: priorBoxVarOriginDim[2], c: priorBoxVarOriginDim[3]))
-    print(" prior box var ")
-    print(priorBoxVarArray.strideArray())
-    
-    let targetBoxOriginDim = para.targetBox.originDim
-    let targetBoxArray = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxOriginDim[0], h: targetBoxOriginDim[1], w: targetBoxOriginDim[2], c: targetBoxOriginDim[3]))
-    print(" target box ")
-    print(targetBoxArray.strideArray())
+//    let priorBoxOriginDim = para.priorBox.originDim
+//    let priorBoxArray: [Float32] = para.priorBox.metalTexture.realNHWC(dim: (n: priorBoxOriginDim[0], h: priorBoxOriginDim[1], w: priorBoxOriginDim[2], c: priorBoxOriginDim[3]))
+//    print(" prior box ")
+//    print(priorBoxArray.strideArray())
+//
+//    let priorBoxVarOriginDim = para.priorBoxVar.originDim
+//    let priorBoxVarArray: [Float32] = para.priorBoxVar.metalTexture.realNHWC(dim: (n: priorBoxVarOriginDim[0], h: priorBoxVarOriginDim[1], w: priorBoxVarOriginDim[2], c: priorBoxVarOriginDim[3]))
+//    print(" prior box var ")
+//    print(priorBoxVarArray.strideArray())
+//
+//    let targetBoxOriginDim = para.targetBox.originDim
+//    let targetBoxArray: [Float32] = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxOriginDim[0], h: targetBoxOriginDim[1], w: targetBoxOriginDim[2], c: targetBoxOriginDim[3]))
+//    print(" target box ")
+//    print(targetBoxArray.strideArray())
     
     let originDim = para.output.originDim
     
-    let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
+    let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
     print(outputArray.strideArray())
-    
   }
   
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
index 117fd8c39fadec55b88cc1aeaf3b91e5a0dd966d..aac56ef4f9ffd9711791c3bc7e2ca11702fc4e7b 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
@@ -66,9 +66,16 @@ class ConcatOp<P: PrecisionType>: Operator<ConcatKernel<P>, ConcatParam<P>>, Run
   func delogOutput() {
     print(" \(type) output: ")
     let originDim = para.output.originDim
-    let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
-    print(outputArray.strideArray())
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+    
+    if para.output.transpose == [0, 1, 2, 3] {
+      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
+      print(outputArray.strideArray())
+    } else if para.output.transpose == [0, 2, 3, 1] {
+      print(para.output.metalTexture.toTensor(dim: (n: originDim[0], c: originDim[1], h: originDim[2], w: originDim[3]), texturePrecision: computePrecision).strideArray())
+    } else {
+      fatalError(" not implemet")
+    }
+    
   }
 }
 
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
index 6f67014444e5ef82fe4cdc30f99bc371fef2d417..7bced214bd11bfef61eb405d59073f004e765e03 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
@@ -125,13 +125,6 @@ class ConvAddBatchNormReluOp<P: PrecisionType>: Operator<ConvAddBatchNormReluKer
     //        let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false)
     //        let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false)
     
-    let output = para.output.metalTexture.floatArray { (p: P) -> P in
-      return p
-    }
-    //
-    writeToLibrary(fileName: "output_112x112x32_2", array: output)
-    print(" write done")
-    
     //        let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false)
   }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
index c42e5fa1d8a5de54c4ab4d251097eb876411a350..5e344014188061c3dbb411226b2655a3bc2659b8 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
@@ -46,9 +46,6 @@ class ConvAddParam<P: PrecisionType>: OpParam {
 class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>, Runable, Creator, InferShaperable, Fusion{
   typealias OpType = ConvAddOp<P>
 
- 
-  
-  
   static func fusionNode() -> Node {
     let beginNode = Node.init(inType: gConvType)
     _ = beginNode
@@ -64,7 +61,6 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>,
     return gConvAddType
   }
   
-  
   func inferShape() {
     
     let inDims = para.input.dim
@@ -101,10 +97,8 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>,
     print(para.stride)
     print("dilations: ")
     print(para.dilations)
-    
-    
-    
     print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+    
+    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray())
   }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift
index 3c521a2210614550577369c603dbbdc5e2cb6692..be8c57d3ace01dabd652e0e80a43c5a053213e28 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift
@@ -110,7 +110,7 @@ class ConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNReluPa
   
   func delogOutput() {
     print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray())
+    print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray())
   }
   
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
index ba83de1bf892527737a3a1447352877919c8f18e..387fa420b68f8004a12af85ca398cf306f41a5c6 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
@@ -46,10 +46,10 @@ class ConvTransposeOp<P: PrecisionType>: Operator<ConvTransposeKernel<P>, ConvTr
     print(" \(type) output: ")
     let originDim = para.output.originDim
     if para.output.transpose == [0, 1, 2, 3] {
-      let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
+      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
       print(outputArray.strideArray())
     } else if para.output.transpose == [0, 2, 3, 1] {
-      print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+      print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray())
     } else {
       print(" not implement")
     }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift
index 639c22ce12c7a110cf58f3f9e7b9ee458d393260..36f477bc1cb48007b5b28bf27a7424940918025b 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift
@@ -58,6 +58,6 @@ class DepthConvOp<P: PrecisionType>: Operator<ConvKernel<P>, ConvParam<P>>, Runa
   
   func delogOutput() {
     print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray())
+    print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray())
   }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift
index 16a42d5c7b24e7b3a26cab35f68decd226076876..0ea8a62c5c0bf30da200add2a96410136d2f40fb 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift
@@ -65,6 +65,6 @@ class DwConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNRelu
   
   func delogOutput() {
     print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray())
+    print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray())
   }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
index 4812f051820385727e08ad79f40f7820bb3310f5..0f96b204d59f3d4a0dd0fae20340811855421c95 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
@@ -56,31 +56,30 @@ class ElementwiseAddOp<P: PrecisionType>: Operator<ElementwiseAddKernel<P>, Elem
 //    para.output.dim = para.input.dim
   }
   
+  /// Encodes this operator's kernel into `buffer` using `para`.
+  ///
+  /// Any error thrown by `kernel.compute` propagates to the caller as-is,
+  /// so no local `do`/`catch` wrapper is required.
+  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+    try kernel.compute(commandBuffer: buffer, param: para)
+  }
+  
   func delogOutput() {
-    print(" \(type) inputX: ")
-    print(para.inputX.metalTexture.toTensor(dim: (n: para.inputX.tensorDim[0], c: para.inputX.tensorDim[1], h: para.inputX.tensorDim[2], w: para.inputX.tensorDim[3])).strideArray())
-    print(" \(type) inputY: ")
-    print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray())
+//    print(" \(type) inputX: ")
+//    print(para.inputX.metalTexture.toTensor(dim: (n: para.inputX.tensorDim[0], c: para.inputX.tensorDim[1], h: para.inputX.tensorDim[2], w: para.inputX.tensorDim[3])).strideArray())
+//    print(" \(type) inputY: ")
+//    print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray())
     
     print(" \(type) output: ")
     let originDim = para.output.originDim
     if para.output.transpose == [0, 1, 2, 3] {
-      let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
+      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
       print(outputArray.strideArray())
     } else if para.output.transpose == [0, 2, 3, 1] {
-      print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+      print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray())
     } else {
       print(" not implement")
     }
-    
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
-    }
   }
 }
 
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift
index 530fb8a32b1aa97b6a61ed6f5f2d8a77f453a384..f58358761f820809685510fa4e9b5ff237567b3c 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift
@@ -19,68 +19,76 @@ public protocol TestParam {
 }
 
 public protocol Testable {
-    associatedtype TestParamType: TestParam
-    func test(commandBuffer: MTLCommandBuffer, param: TestParamType)
-    init(device: MTLDevice, testParam: TestParamType)
+  associatedtype TestParamType: TestParam
+  func test(commandBuffer: MTLCommandBuffer, param: TestParamType)
+  init(device: MTLDevice, testParam: TestParamType)
 }
 
 
 protocol Computable {
-    associatedtype ParamType: OpParam
-    func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws
-    init(device: MTLDevice, param: ParamType)
+  associatedtype ParamType: OpParam
+  func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws
+  init(device: MTLDevice, param: ParamType)
 }
 
 protocol KernelProtocol {
-    var pipline: MTLComputePipelineState { get set }
-    var functionName: String { get set }
-   
+  var pipline: MTLComputePipelineState { get set }
+  var functionName: String { get set }
+  
 }
 
 open class Kernel {
-    let pipline: MTLComputePipelineState
-    let functionName: String
-    public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = true) {
-        pipline = device.pipeLine(funcName: inFunctionName, inPaddleMobileLib: usePaddleMobileLib)
-        functionName = inFunctionName
-    }
+  let pipline: MTLComputePipelineState
+  let functionName: String
+  public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = true) {
+    pipline = device.pipeLine(funcName: inFunctionName, inPaddleMobileLib: usePaddleMobileLib)
+    functionName = inFunctionName
+  }
 }
 
 open class CusomKernel: Kernel {
-    public struct Shape {
-        public let width: Int
-        public let height: Int
-        public let channel: Int
-        public init(inWidth: Int, inHeight: Int, inChannel: Int){
-            width = inWidth
-            height = inHeight
-            channel = inChannel
-        }
-    }
-    public let outputTexture: MTLTexture
-    public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, usePaddleMobileLib: Bool = false) {
-        let textureDesc = MTLTextureDescriptor.init()
-        textureDesc.textureType = .type2D
-        textureDesc.width = outputDim.width
-        textureDesc.height = outputDim.height
-        textureDesc.depth = (outputDim.channel + 3) / 4
-        textureDesc.pixelFormat = .rgba32Float
-        textureDesc.usage = [.shaderRead, .shaderWrite]
-        textureDesc.storageMode = .shared
-        outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error "
-
-        super.init(device: device, inFunctionName: inFunctionName, usePaddleMobileLib: usePaddleMobileLib)
+  public struct Shape {
+    public let width: Int
+    public let height: Int
+    public let channel: Int
+    public init(inWidth: Int, inHeight: Int, inChannel: Int){
+      width = inWidth
+      height = inHeight
+      channel = inChannel
     }
+  }
+  public let outputTexture: MTLTexture
+  public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, usePaddleMobileLib: Bool = false) {
+    let textureDesc = MTLTextureDescriptor.init()
+    textureDesc.textureType = .type2D
+    textureDesc.width = outputDim.width
+    textureDesc.height = outputDim.height
+    textureDesc.depth = (outputDim.channel + 3) / 4
     
-    public func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws {
-        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-            throw PaddleMobileError.predictError(message: " encode is nil")
-        }
-        encoder.setTexture(inputTexuture, index: 0)
-        encoder.setTexture(outputTexture, index: 1)
-        encoder.dispatch(computePipline: pipline, outTexture: outputTexture)
-        encoder.endEncoding()
+    if computePrecision == .Float16 {
+      textureDesc.pixelFormat = .rgba16Float
+    } else if computePrecision == .Float32 {
+      textureDesc.pixelFormat = .rgba32Float
+    } else {
+      fatalError()
     }
     
+    textureDesc.usage = [.shaderRead, .shaderWrite]
+    textureDesc.storageMode = .shared
+    outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error "
+    
+    super.init(device: device, inFunctionName: inFunctionName, usePaddleMobileLib: usePaddleMobileLib)
+  }
+  
+  public func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws {
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+      throw PaddleMobileError.predictError(message: " encode is nil")
+    }
+    encoder.setTexture(inputTexuture, index: 0)
+    encoder.setTexture(outputTexture, index: 1)
+    encoder.dispatch(computePipline: pipline, outTexture: outputTexture)
+    encoder.endEncoding()
+  }
+  
 }
 
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift
index 722ab6b64c953c1fef28082f75794d9e581251ef..939f5db5f192082470ea2ad8773db95af22ffed4 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift
@@ -18,22 +18,29 @@ struct BoxcoderMetalParam {
 }
 
 class BoxcoderKernel<P: PrecisionType>: Kernel, Computable{
-    func compute(commandBuffer: MTLCommandBuffer, param: BoxcoderParam<P>) throws {
-        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-            throw PaddleMobileError.predictError(message: " encode is nil")
-        }
-        encoder.setTexture(param.priorBox.metalTexture, index: 0)
-        encoder.setTexture(param.priorBoxVar.metalTexture, index: 1)
-        encoder.setTexture(param.targetBox.metalTexture, index: 2)
-        encoder.setTexture(param.output.metalTexture, index: 3)
-        var bmp = BoxcoderMetalParam.init()
-        encoder.setBytes(&bmp, length: MemoryLayout<BoxcoderMetalParam>.size, index: 0)
-        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-        encoder.endEncoding()
+  func compute(commandBuffer: MTLCommandBuffer, param: BoxcoderParam<P>) throws {
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+      throw PaddleMobileError.predictError(message: " encode is nil")
     }
-    
-    required init(device: MTLDevice, param: BoxcoderParam<P>) {
-        param.output.initTexture(device: device)
-        super.init(device: device, inFunctionName: "boxcoder")
+    encoder.setTexture(param.priorBox.metalTexture, index: 0)
+    encoder.setTexture(param.priorBoxVar.metalTexture, index: 1)
+    encoder.setTexture(param.targetBox.metalTexture, index: 2)
+    encoder.setTexture(param.output.metalTexture, index: 3)
+    var bmp = BoxcoderMetalParam.init()
+    encoder.setBytes(&bmp, length: MemoryLayout<BoxcoderMetalParam>.size, index: 0)
+    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+    encoder.endEncoding()
+  }
+  
+  required init(device: MTLDevice, param: BoxcoderParam<P>) {
+    param.output.initTexture(device: device, computePrecision: computePrecision)
+    if computePrecision == .Float32 {
+      super.init(device: device, inFunctionName: "boxcoder")
+    } else if computePrecision == .Float16 {
+      super.init(device: device, inFunctionName: "boxcoder_half")
+    } else {
+      fatalError()
     }
+  }
+  
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift
index 60f1437e7fabf0ae088b41f37cc01e2981cbf236..644476ad9dbb471786611fe25a30ed9c4833edbd 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift
@@ -121,8 +121,14 @@ class ConcatKernel<P: PrecisionType>: Kernel, Computable{
   }
   
   required init(device: MTLDevice, param: ConcatParam<P>) {
-    param.output.initTexture(device: device, inTranspose: param.transpose)
-    super.init(device: device, inFunctionName: "concat")
+    param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: computePrecision)
+    if computePrecision == .Float32 {
+      super.init(device: device, inFunctionName: "concat")
+    } else if computePrecision == .Float16 {
+      super.init(device: device, inFunctionName: "concat_half")
+    } else {
+      fatalError()
+    }
   }
   
   required init(device: MTLDevice, testParam: ConcatTestParam) {
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
index eabadc9d44e7b98fccb0f87e73dd2ffd8da931d7..092207cfb7b9fda63cd6b5aa7082640bae515149 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
@@ -15,127 +15,155 @@
 import Foundation
 
 struct ConvAddBatchNormReluTestParam: TestParam {
-    let inputTexture: MTLTexture
-    let outputTexture: MTLTexture
-    var metalParam: MetalConvParam
-    let filterBuffer: MTLBuffer
-    let biaseBuffer: MTLBuffer
-    let newScaleBuffer: MTLBuffer
-    let newBiaseBuffer: MTLBuffer
-    let filterSize: (width: Int, height: Int, channel: Int)
-    init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) {
-        inputTexture = inInputTexture
-        outputTexture = inOutputTexture
-        metalParam = inMetalParam
-        filterBuffer = inFilterBuffer
-        biaseBuffer = inBiaseBuffer
-        newScaleBuffer = inNewScaleBuffer
-        newBiaseBuffer = inNewBiaseBuffer
-        filterSize = inFilterSize
-    }
+  let inputTexture: MTLTexture
+  let outputTexture: MTLTexture
+  var metalParam: MetalConvParam
+  let filterBuffer: MTLBuffer
+  let biaseBuffer: MTLBuffer
+  let newScaleBuffer: MTLBuffer
+  let newBiaseBuffer: MTLBuffer
+  let filterSize: (width: Int, height: Int, channel: Int)
+  init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) {
+    inputTexture = inInputTexture
+    outputTexture = inOutputTexture
+    metalParam = inMetalParam
+    filterBuffer = inFilterBuffer
+    biaseBuffer = inBiaseBuffer
+    newScaleBuffer = inNewScaleBuffer
+    newBiaseBuffer = inNewBiaseBuffer
+    filterSize = inFilterSize
+  }
 }
 
 class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
-    required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam) {
-        if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 {
-            super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
-        } else if testParam.filterSize.channel == 1 {
-            super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
-        } else {
-            super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
-        }
+  required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam) {
+    if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 {
+      super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
+    } else if testParam.filterSize.channel == 1 {
+      super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
+    } else {
+      super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
     }
+  }
+  
+  var metalParam: MetalConvParam!
+  
+  required init(device: MTLDevice, param: ConvAddBatchNormReluParam<P>) {
     
-    var metalParam: MetalConvParam!
-
-    required init(device: MTLDevice, param: ConvAddBatchNormReluParam<P>) {
-        
-        param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1])
-        
-        if param.filter.width == 1 && param.filter.height == 1 {
-            super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
-        } else if param.filter.channel == 1 {
-            super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
-        } else {
-            super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
-        }
-        
-        param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
-        param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
-        param.variance.initBuffer(device: device)
-        param.mean.initBuffer(device: device)
-        param.scale.initBuffer(device: device)
-        param.bias.initBuffer(device: device)
-        
-        
-        let offsetX = param.filter.width/2 - Int(param.paddings[0])
-        let offsetY = param.filter.height/2 - Int(param.paddings[1])
-        
-        print("offset x: \(offsetX)")
-        print("offset y: \(offsetY)")
-        
-        let offsetZ = 0.0
-        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
-        
-        var invs: [P] = []
-        let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
-        
-        for i in 0..<param.variance.buffer.length/MemoryLayout<P>.stride {            
-            let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
-            invs.append(P(inv))
-        }
-        
-        let newScale: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.scale.buffer.length)
-        let newBiase: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.bias.buffer.length)
-        
-        let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
-        let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
-        let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
-        for i in 0..<param.scale.buffer.length/MemoryLayout<P>.stride {
-            newScale[i] = invs[i] * scaleContents[i]
-            newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i]
-        }
-        param.newBiase = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)
-        param.newScale = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)
-        
-        newScale.deinitialize(count: param.scale.buffer.length)
-        newScale.deallocate()
-        
-        newBiase.deinitialize(count: param.bias.buffer.length)
-        newBiase.deallocate()
+    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
+    
+    if param.filter.width == 1 && param.filter.height == 1 {
+      super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
+    } else if param.filter.channel == 1 {
+      super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
+    } else {
+      super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
     }
     
-    func compute(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluParam<P>) throws {
-        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-            throw PaddleMobileError.predictError(message: " encode is nil")
-        }
+    param.filter.initBuffer(device: device, precision: computePrecision)
+    
+    param.y.initBuffer(device: device, precision: computePrecision)
+    
+    param.variance.initBuffer(device: device, precision: .Float32)
+    param.mean.initBuffer(device: device, precision: .Float32)
+    param.scale.initBuffer(device: device, precision: .Float32)
+    param.bias.initBuffer(device: device, precision: .Float32)
+    
+    
+    let offsetX = param.filter.width/2 - Int(param.paddings[0])
+    let offsetY = param.filter.height/2 - Int(param.paddings[1])
+    
+    print("offset x: \(offsetX)")
+    print("offset y: \(offsetY)")
+    
+    let offsetZ = 0.0
+    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+    
+    var invs: [P] = []
+    let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
+    
+    for i in 0..<param.variance.buffer.length/MemoryLayout<P>.stride {
+      let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
+      invs.append(P(inv))
+    }
+    
+    let newScale: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.scale.buffer.length)
+    let newBiase: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.bias.buffer.length)
+    
+    let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
+    let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
+    let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
+    for i in 0..<param.scale.buffer.length/MemoryLayout<P>.stride {
+      newScale[i] = invs[i] * scaleContents[i]
+      newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i]
+    }
+    
+//    var newScaleFP16: UnsafeMutableRawPointer
+//
+//    float32ToFloat16(input: newScale as! UnsafeMutablePointer<Float32>, output: newScaleFP16, count: param.scale.buffer.length / MemoryLayout<P>.size)
+    
+    
+//    let newBiaseFloat16 = device.makeBuffer(length: <#T##Int#>, options: <#T##MTLResourceOptions#>)
+    
+    var newBiaseBuffer: MTLBuffer
+    var newScaleBuffer: MTLBuffer
+    
+    if computePrecision == .Float32 {
+      newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)!
+      newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)!
+    } else if computePrecision == .Float16 {
+      // Half precision: narrow the folded values into half-sized buffers (the `as!` casts assume P == Float32).
-
-        encoder.setTexture(param.input.metalTexture, index: 0)
-        encoder.setTexture(param.output.metalTexture, index: 1)
-        encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-        encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
-        encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
-        encoder.setBuffer(param.newScale!, offset: 0, index: 3)
-        encoder.setBuffer(param.newBiase!, offset: 0, index: 4)
-        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-        encoder.endEncoding()
+      newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
+      newScaleBuffer = device.makeBuffer(length: param.scale.buffer.length / 2)!
+      
+      float32ToFloat16(input: newBiase as! UnsafeMutablePointer<Float32>, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout<P>.size)
+      
+      float32ToFloat16(input: newScale as! UnsafeMutablePointer<Float32>, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout<P>.size)
+    } else {
+      fatalError(" unsupport ")
+    }
     
-    public func test(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluTestParam) {
-        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-            fatalError()
-        }
-        
-        encoder.setTexture(param.inputTexture, index: 0)
-        encoder.setTexture(param.outputTexture, index: 1)
-        var inMetalParam = param.metalParam
-        encoder.setBytes(&inMetalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-        encoder.setBuffer(param.filterBuffer, offset: 0, index: 1)
-        encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2)
-        encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3)
-        encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4)
-        encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
-        encoder.endEncoding()
+    param.newBiase = newBiaseBuffer
+    param.newScale = newScaleBuffer
+    
+    newScale.deinitialize(count: param.scale.buffer.length)
+    newScale.deallocate()
+    
+    newBiase.deinitialize(count: param.bias.buffer.length)
+    newBiase.deallocate()
+  }
+  
+  func compute(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluParam<P>) throws {
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+      throw PaddleMobileError.predictError(message: " encode is nil")
     }
+    
+    
+    encoder.setTexture(param.input.metalTexture, index: 0)
+    encoder.setTexture(param.output.metalTexture, index: 1)
+    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+    encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
+    encoder.setBuffer(param.newScale!, offset: 0, index: 3)
+    encoder.setBuffer(param.newBiase!, offset: 0, index: 4)
+    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+    encoder.endEncoding()
+  }
+  
+  public func test(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluTestParam) {
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+      fatalError()
+    }
+    
+    encoder.setTexture(param.inputTexture, index: 0)
+    encoder.setTexture(param.outputTexture, index: 1)
+    var inMetalParam = param.metalParam
+    encoder.setBytes(&inMetalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+    encoder.setBuffer(param.filterBuffer, offset: 0, index: 1)
+    encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2)
+    encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3)
+    encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4)
+    encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
+    encoder.endEncoding()
+  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
index 83dd4f996ab23a94824deb6194241d6a52ace487..ce1e0f6560e9911e862ead537089d37fdb4fe1c4 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
@@ -17,22 +17,35 @@ import Foundation
 class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
   var metalParam: MetalConvParam!
   required init(device: MTLDevice, param: ConvAddParam<P>) {
-    if param.filter.width == 1 && param.filter.height == 1 {
-      super.init(device: device, inFunctionName: "conv_add_1x1")
-    } else if param.filter.channel == 1 {
-      super.init(device: device, inFunctionName: "depthwise_conv_add_3x3")
+    
+    if computePrecision == .Float16 {
+      if param.filter.width == 1 && param.filter.height == 1 {
+        super.init(device: device, inFunctionName: "conv_add_1x1_half")
+      } else if param.filter.channel == 1 {
+        super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half")
+      } else {
+        super.init(device: device, inFunctionName: "conv_add_3x3_half")
+      }
+    } else if computePrecision == .Float32 {
+      if param.filter.width == 1 && param.filter.height == 1 {
+        super.init(device: device, inFunctionName: "conv_add_1x1")
+      } else if param.filter.channel == 1 {
+        super.init(device: device, inFunctionName: "depthwise_conv_add_3x3")
+      } else {
+        super.init(device: device, inFunctionName: "conv_add_3x3")
+      }
     } else {
-      super.init(device: device, inFunctionName: "conv_add_3x3")
+      fatalError()
     }
     
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1])
+    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
     
     let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
     
     let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
 
-    param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
-    param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
+    param.filter.initBuffer(device: device, precision: computePrecision)
+    param.y.initBuffer(device: device, precision: computePrecision)
     
     print("offset x: \(offsetX)")
     print("offset y: \(offsetY)")
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
index c5d3ffe6c944ab9019f5b80e66b4691057209529..350c81cece15a242e1c6b7bb91cf515a4eaf2335 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
@@ -51,21 +51,33 @@ class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
   var metalParam: MetalConvParam!
   
   required init(device: MTLDevice, param: ConvBNReluParam<P>) {
-    
-    if param.filter.width == 1 && param.filter.height == 1 {
-      super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1")
-    } else if param.filter.channel == 1 {
-      super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3")
+    if computePrecision == .Float32 {
+      if param.filter.width == 1 && param.filter.height == 1 {
+        super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1")
+      } else if param.filter.channel == 1 {
+        super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3")
+      } else {
+        super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3")
+      }
+    } else if computePrecision == .Float16 {
+      if param.filter.width == 1 && param.filter.height == 1 {
+        super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half")
+      } else if param.filter.channel == 1 {
+        super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half")
+      } else {
+        super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half")
+      }
     } else {
-      super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3")
+      fatalError()
     }
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1])
-    param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
     
-    param.variance.initBuffer(device: device)
-    param.mean.initBuffer(device: device)
-    param.scale.initBuffer(device: device)
-    param.bias.initBuffer(device: device)
+    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
+    param.filter.initBuffer(device: device, precision: computePrecision)
+    
+    param.variance.initBuffer(device: device, precision: .Float32)
+    param.mean.initBuffer(device: device, precision: .Float32)
+    param.scale.initBuffer(device: device, precision: .Float32)
+    param.bias.initBuffer(device: device, precision: .Float32)
     
     let offsetX = param.filter.width/2 - Int(param.paddings[0])
     let offsetY = param.filter.height/2 - Int(param.paddings[1])
@@ -102,8 +114,26 @@ class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
       newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i]
     }
     
-    param.newBiase = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)
-    param.newScale = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)
+    var newBiaseBuffer: MTLBuffer
+    var newScaleBuffer: MTLBuffer
+    
+    if computePrecision == .Float32 {
+      newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)!
+      newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)!
+    } else if computePrecision == .Float16 {
+      // Half precision: narrow the folded values into half-sized buffers (the `as!` casts assume P == Float32).
+      newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
+      newScaleBuffer = device.makeBuffer(length: param.scale.buffer.length / 2)!
+      
+      float32ToFloat16(input: newBiase as! UnsafeMutablePointer<Float32>, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout<P>.size)
+      
+      float32ToFloat16(input: newScale as! UnsafeMutablePointer<Float32>, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout<P>.size)
+    } else {
+      fatalError(" unsupport ")
+    }
+    
+    param.newBiase = newBiaseBuffer
+    param.newScale = newScaleBuffer
     
     newScale.deinitialize(count: param.scale.buffer.length)
     newScale.deallocate()
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
index 680beba1ea711b389dd6117fc84f00b6079c9a60..e0485851fd610781f475eb43be1ce6fd4937a4ef 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
@@ -39,7 +39,7 @@ class ConvKernel<P: PrecisionType>: Kernel, Computable {
     let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0])
     let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1])
     let offsetZ = 0.0
-    param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
+    param.filter.initBuffer(device: device, precision: ComputePrecision.Float32)
     
     metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
   }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift
index 5e8d92054bd2fa15af2d3e75860c0dc4d9b93e5c..b6db7231e83943dbce6f2cbe3266af9fbe508aef 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift
@@ -28,7 +28,7 @@ class PoolKernel<P: PrecisionType>: Kernel, Computable{
   
   required init(device: MTLDevice, param: PoolParam<P>) {
     super.init(device: device, inFunctionName: "pool")
-    param.output.initTexture(device: device, inTranspose: param.input.transpose)
+    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
   }
   
   func compute(commandBuffer: MTLCommandBuffer, param: PoolParam<P>) throws {
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift
index ad925eb174414ed8f48cc8dd5bf090bc2ed0aed2..1545a848dacb4f11a2a68df31f7ea49a23799a87 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift
@@ -17,8 +17,8 @@ class PreluKernel<P: PrecisionType>: Kernel, Computable{
     } else {
       super.init(device: device, inFunctionName: "prelu_other")
     }
-    param.alpha.initBuffer(device: device)
-    param.output.initTexture(device: device, inTranspose: param.input.transpose)
+    param.alpha.initBuffer(device: device, precision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
   }
   
   func compute(commandBuffer: MTLCommandBuffer, param: PreluParam<P>) throws {
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
index e2363e44d3a3d81b430f82303b2b1017ddfc5200..08a489ab2298c937f8878af94b557c2fa60d18d0 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
@@ -33,11 +33,16 @@ class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{
   var metalParam: PriorBoxMetalParam!
   
   required init(device: MTLDevice, param: PriorBoxParam<P>) {
-    super.init(device: device, inFunctionName: "prior_box")
-    param.output.initTexture(device: device, inTranspose: [2, 0, 1, 3])
+    if computePrecision == .Float32 {
+      super.init(device: device, inFunctionName: "prior_box")
+    } else if computePrecision == .Float16 {
+      super.init(device: device, inFunctionName: "prior_box_half")
+    } else {
+      fatalError()
+    }
     
-
-    param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3])
+    param.output.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
+    param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
     
     let n = 1
     let h = param.output.dim[1]
@@ -79,7 +84,18 @@ class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{
       }
     }
     
-    param.newAspectRatios = outputAspectRatior
+    if computePrecision == .Float16 {
+      // makeBuffer(length:) is in bytes; the conversion below is asked for `count` halves at 2 bytes each.
+      let buffer = device.makeBuffer(length: outputAspectRatior.count * MemoryLayout<UInt16>.size)
+      float32ToFloat16(input: &outputAspectRatior, output:(buffer?.contents())!, count: outputAspectRatior.count)
+      param.newAspectRatios = buffer
+    } else if computePrecision == .Float32 {
+      let buffer = device.makeBuffer(bytes: outputAspectRatior, length: outputAspectRatior.count * MemoryLayout<Float32>.size, options: [])
+      param.newAspectRatios = buffer
+    } else {
+      fatalError()
+    }
+    
     let aspectRatiosSize = uint(outputAspectRatior.count)
     
     let maxSizeSize: uint = uint(param.maxSizes.count)
@@ -102,12 +118,13 @@ class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{
     encoder.setTexture(param.input.metalTexture, index: 0)
     encoder.setTexture(param.output.metalTexture, index: 1)
     encoder.setTexture(param.outputVariances.metalTexture, index: 2)
-    encoder.setBytes(&metalParam, length: MemoryLayout<PriorBoxMetalParam>.size, index: 0)
-    encoder.setBytes(param.newAspectRatios!, length: MemoryLayout<Float32>.size * param.newAspectRatios!.count, index: 1)
+    // Buffer slots: 0 = aspect ratios, 1 = metalParam, 2 = variances. NOTE(review): must match the shader argument order — confirm.
+    encoder.setBuffer(param.newAspectRatios!, offset: 0, index: 0)
+    
+    encoder.setBytes(&metalParam, length: MemoryLayout<PriorBoxMetalParam>.size, index: 1)
+    
     encoder.setBytes(param.variances, length: MemoryLayout<Float32>.size * param.variances.count, index: 2)
     encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
     encoder.endEncoding()
   }
-  
-  
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift
index 96a1abb6df964ee24d74ca9979ca59512f4e4265..3916c07ce5e8d4f3179a8a3100563a77e68eb53b 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift
@@ -15,58 +15,65 @@
 import Foundation
 
 struct ReshapeMetalParam {
-    var idim: (Int32, Int32, Int32, Int32)
-    var itrans: (Int32, Int32, Int32, Int32)
-    var odim: (Int32, Int32, Int32, Int32)
-    var otrans: (Int32, Int32, Int32, Int32)
+  var idim: (Int32, Int32, Int32, Int32)
+  var itrans: (Int32, Int32, Int32, Int32)
+  var odim: (Int32, Int32, Int32, Int32)
+  var otrans: (Int32, Int32, Int32, Int32)
 }
 
 struct ReshapeTestParam: TestParam {
-    let inputTexture: MTLTexture
-    let outputTexture: MTLTexture
-    let param: ReshapeMetalParam
+  let inputTexture: MTLTexture
+  let outputTexture: MTLTexture
+  let param: ReshapeMetalParam
 }
 
 class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
-    required init(device: MTLDevice, param: ReshapeParam<P>) {
-        param.output.initTexture(device: device)
-        super.init(device: device, inFunctionName: "reshape")
+  required init(device: MTLDevice, param: ReshapeParam<P>) {
+    param.output.initTexture(device: device, computePrecision: computePrecision)
+    if computePrecision == .Float32 {
+      super.init(device: device, inFunctionName: "reshape")
+    } else if computePrecision == .Float16 {
+      super.init(device: device, inFunctionName: "reshape_half")
+    } else {
+      fatalError()
     }
-    
-    required init(device: MTLDevice, testParam: ReshapeTestParam) {
-        super.init(device: device, inFunctionName: "reshape")
-    }
-    
-    func compute(commandBuffer: MTLCommandBuffer, param: ReshapeParam<P>) throws {
-        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-            throw PaddleMobileError.predictError(message: " encoder is nil")
-        }
-        encoder.setTexture(param.input.metalTexture, index: 0)
-        encoder.setTexture(param.output.metalTexture, index: 1)
-        let id: [Int32] = (0..<4).map { Int32(param.input.dim[$0]) }
-        let it: [Int32] = param.input.transpose.map { Int32($0) }
-        let od: [Int32] = (0..<4).map { Int32(param.output.dim[$0]) }
-        let ot: [Int32] = param.output.transpose.map { Int32($0) }
-        var rmp = ReshapeMetalParam.init(
-            idim: (id[0], id[1], id[2], id[3]),
-            itrans: (it[0], it[1], it[2], it[3]),
-            odim: (od[0], od[1], od[2], od[3]),
-            otrans: (ot[0], ot[1], ot[2], ot[3])
-        )
-        encoder.setBytes(&rmp, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
-        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-        encoder.endEncoding()
+  }
+  /// Test initializer. NOTE(review): always loads "reshape", whereas init(device:param:) picks "reshape_half" under .Float16 — confirm intended.
+  required init(device: MTLDevice, testParam: ReshapeTestParam) {
+    super.init(device: device, inFunctionName: "reshape")
+  }
+  
+  func compute(commandBuffer: MTLCommandBuffer, param: ReshapeParam<P>) throws {
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+      throw PaddleMobileError.predictError(message: " encoder is nil")
     }
     
-    func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) {
-        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-            fatalError()
-        }
-        encoder.setTexture(testParam.inputTexture, index: 0)
-        encoder.setTexture(testParam.outputTexture, index: 1)
-        var pm: ReshapeMetalParam = testParam.param
-        encoder.setBytes(&pm, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
-        encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture)
-        encoder.endEncoding()
+    encoder.setTexture(param.input.metalTexture, index: 0)
+    encoder.setTexture(param.output.metalTexture, index: 1)
+    let id: [Int32] = (0..<4).map { Int32(param.input.dim[$0]) }
+    let it: [Int32] = param.input.transpose.map { Int32($0) }
+    let od: [Int32] = (0..<4).map { Int32(param.output.dim[$0]) }
+    let ot: [Int32] = param.output.transpose.map { Int32($0) }
+    var rmp = ReshapeMetalParam.init(
+      idim: (id[0], id[1], id[2], id[3]),
+      itrans: (it[0], it[1], it[2], it[3]),
+      odim: (od[0], od[1], od[2], od[3]),
+      otrans: (ot[0], ot[1], ot[2], ot[3])
+    )
+    encoder.setBytes(&rmp, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
+    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+    encoder.endEncoding()
+  }
+  
+  func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) {
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+      fatalError()
     }
+    encoder.setTexture(testParam.inputTexture, index: 0)
+    encoder.setTexture(testParam.outputTexture, index: 1)
+    var pm: ReshapeMetalParam = testParam.param
+    encoder.setBytes(&pm, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
+    encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture)
+    encoder.endEncoding()
+  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift
index 5d2d5b1c7af5d9822394d2e7de9b251085c035dc..6f6d0af477f62d7f438b8b6a38c825c2eb95163f 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift
@@ -38,7 +38,13 @@ class SoftmaxKernel<P: PrecisionType>: Kernel, Computable{
   }
   
   required init(device: MTLDevice, param: SoftmaxParam<P>) {
-    param.output.initTexture(device: device)
-    super.init(device: device, inFunctionName: "softmax")
+    param.output.initTexture(device: device, computePrecision: computePrecision)
+    if computePrecision == .Float32 {
+      super.init(device: device, inFunctionName: "softmax")
+    } else if computePrecision == .Float16 {
+      super.init(device: device, inFunctionName: "softmax_half")
+    } else {
+      fatalError()
+    }
   }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift
index 33e1219b4d0fff972d8db3d16fc7ce1477841351..6594b3474f0abb04364246830f79302f487af499 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift
@@ -41,33 +41,27 @@ struct TransposeTestParam: TestParam {
 }
 
 class TransposeKernel<P: PrecisionType>: Kernel, Computable, Testable {
-  var metalParam: TransposeMetalParam!
-  func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
-    }
-  
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<TransposeMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
   
   required init(device: MTLDevice, param: TransposeParam<P>) {
-    param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3])
-    super.init(device: device, inFunctionName: "transpose")
+    param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: computePrecision)
     
+    if computePrecision == .Float16 {
+      super.init(device: device, inFunctionName: "transpose_half")
+    } else if computePrecision == .Float32 {
+      super.init(device: device, inFunctionName: "transpose")
+    } else {
+      fatalError()
+    }
     var invT: [Int] = [0, 1, 2, 3]
     for (i, v) in param.input.transpose.enumerated() {
       invT[v] = i
     }
     var axis: [Int] = [0, 1, 2, 3]
     
-//    var doNothing = false
-//    if param.axis.count == param.input.transpose.count {
-//      doNothing = param.axis == param.input.transpose.map { Int32($0) }
-//    }
+    //    var doNothing = false
+    //    if param.axis.count == param.input.transpose.count {
+    //      doNothing = param.axis == param.input.transpose.map { Int32($0) }
+    //    }
     
     
     for i in 0..<param.axis.count {
@@ -84,10 +78,30 @@ class TransposeKernel<P: PrecisionType>: Kernel, Computable, Testable {
     }
     metalParam = tmp
   }
+  
   required init(device: MTLDevice, testParam: TransposeTestParam) {
-    super.init(device: device, inFunctionName: "transpose")
-    fatalError()
+    if computePrecision == .Float16 {
+      super.init(device: device, inFunctionName: "transpose_half")
+    } else if computePrecision == .Float32 {
+      super.init(device: device, inFunctionName: "transpose")
+    } else {
+      fatalError()
+    }
+  }
+  
+  var metalParam: TransposeMetalParam!
+  func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam<P>) throws {
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+      throw PaddleMobileError.predictError(message: " encode is nil")
+    }
+  
+    encoder.setTexture(param.input.metalTexture, index: 0)
+    encoder.setTexture(param.output.metalTexture, index: 1)
+    encoder.setBytes(&metalParam, length: MemoryLayout<TransposeMetalParam>.size, index: 0)
+    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+    encoder.endEncoding()
   }
+
   
   public func test(commandBuffer: MTLCommandBuffer, param: TransposeTestParam) {
     guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal
index 9a177488861c42740f3a0343b3cb41bb0b969137..7abc17ec6e7a204af4d74b28d40e2a4c69dddc4b 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal
@@ -34,7 +34,6 @@ kernel void boxcoder(texture2d_array<float, access::read> priorBox [[texture(0)]
   float tw = exp(pv.z * t.z) * pw;
   float th = exp(pv.w * t.w) * ph;
   
-  
   float4 r;
   r.x = tx - tw / 2;
   r.y = ty - th / 2;
@@ -43,3 +42,31 @@ kernel void boxcoder(texture2d_array<float, access::read> priorBox [[texture(0)]
 
   output.write(r, gid.xy, gid.z);
 }
+// Half-texture box decoder: reads half4 texels, does all arithmetic in float, writes the result back as half4.
+kernel void boxcoder_half(texture2d_array<half, access::read> priorBox [[texture(0)]],
+                     texture2d_array<half, access::read> priorBoxVar [[texture(1)]],
+                     texture2d_array<half, access::read> targetBox [[texture(2)]],
+                     texture2d_array<half, access::write> output[[texture(3)]],
+                     uint3 gid [[thread_position_in_grid]]) {
+  half4 t = targetBox.read(gid.xy, gid.z);
+  half4 p = priorBox.read(gid.xy, gid.z);
+  half4 pv = priorBoxVar.read(gid.xy, gid.z);
+  // Midpoint and extent of the prior box (presumably stored as xmin, ymin, xmax, ymax — confirm).
+  float px = (float(p.x) + float(p.z)) / 2;
+  float py = (float(p.y) + float(p.w)) / 2;
+  float pw = float(p.z) - float(p.x);
+  float ph = float(p.w) - float(p.y);
+  // Shift the midpoint by variance * target * extent; scale the extent by exp(variance * target).
+  float tx = float(pv.x) * float(t.x) * pw + px;
+  float ty = float(pv.y) * float(t.y) * ph + py;
+  float tw = exp(float(pv.z) * float(t.z)) * pw;
+  float th = exp(float(pv.w) * float(t.w)) * ph;
+  // Turn midpoint/extent back into two corner points.
+  float4 r;
+  r.x = tx - tw / 2;
+  r.y = ty - th / 2;
+  r.z = tx + tw / 2;
+  r.w = ty + th / 2;
+  
+  output.write(half4(r), gid.xy, gid.z);
+}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal
index c5c4ffc5c995500503411148db31b2acfa3459b6..d37be42be64f8fdd7325fd62a68e646737b6dedf 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal
@@ -57,3 +57,14 @@ inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) {
     opos[trans[i]] = ipos[i];
   }
 }
+
+// NOTE(review): presumably mirrors the Swift-side MetalConvParam; ConvKernel.swift passes a paddedZ between strideY and dilationX that is absent here — confirm the layouts agree.
+struct MetalConvParam {
+  short offsetX;
+  short offsetY;
+  short offsetZ;
+  ushort strideX;
+  ushort strideY;
+  ushort dilationX;
+  ushort dilationY;
+};
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Concat.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Concat.metal
index 09c0e8dadab759bbdf514f347eff3eb005bfac2f..92d80c315e0d5ca19711b4a2165c89077979d49d 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Concat.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Concat.metal
@@ -69,3 +69,48 @@ kernel void concat(texture2d_array<float, access::read> in0 [[texture(0)]],
   }
   out.write(r, gid.xy, gid.z);
 }
+// Half-texture concat: each of the 4 output lanes is copied from inx (index before cp.offset) or from whichever of in0..in5 covers it.
+kernel void concat_half(texture2d_array<half, access::read> in0 [[texture(0)]],
+                   texture2d_array<half, access::read> in1 [[texture(1)]],
+                   texture2d_array<half, access::read> in2 [[texture(2)]],
+                   texture2d_array<half, access::read> in3 [[texture(3)]],
+                   texture2d_array<half, access::read> in4 [[texture(4)]],
+                   texture2d_array<half, access::read> in5 [[texture(5)]],
+                   texture2d_array<half, access::read> inx [[texture(6)]],
+                   texture2d_array<half, access::write> out [[texture(7)]],
+                   constant ConcatParam & pm [[buffer(0)]],
+                   uint3 gid [[thread_position_in_grid]]) {
+  ConcatParam cp = pm;
+  int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4];
+  half4 r;
+  for (int i = 0; i < 4; i++) {
+    xyzn[3] = i;
+    xyzn2abcd(cp.odim[3], xyzn, abcd);
+    int k = abcd[cp.axis] - cp.offset;
+    int j = 0;
+    if (k < 0) {
+      r[i] = inx.read(gid.xy, gid.z)[i];
+    } else {
+      for (; j < 6; j++) {
+        if (k < cp.vdim[j]) {
+          break;
+        }
+        k -= cp.vdim[j];
+      }
+      int ta = cp.odim[cp.axis];
+      abcd[cp.axis] = k;
+      cp.odim[cp.axis] = cp.vdim[j];
+      abcd2xyzn(cp.odim[3], abcd, oxyzn);
+      cp.odim[cp.axis] = ta;
+      switch (j) {
+        case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+        case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+        case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+        case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+        case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+        case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+      }
+    }
+  }
+  out.write(r, gid.xy, gid.z);
+}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..ffa66212b16bb6c6180910cae2d0c34f8659c556
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal
@@ -0,0 +1,308 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+
+// 1x1 convolution + bias add + scale/shift + ReLU on half textures; one thread per output texel.
+kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                              constant MetalConvParam &param [[buffer(0)]],
+                                              const device half4 *weights [[buffer(1)]],
+                                              const device half4 *biase [[buffer(2)]],
+                                              const device float4 *new_scale [[buffer(3)]],
+                                              const device float4 *new_biase [[buffer(4)]],
+                                              uint3 gid [[thread_position_in_grid]]) {
+  // Skip threads that fall outside the output texture.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 1;
+  
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+  
+  half4 output = half4(0.0);
+  // Each output lane (x/y/z/w) dots the input texel with its own run of input_arr_size weight vectors.
+  half4 input;
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+    half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+    output.x += dot(input, weight_x);
+    
+    half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+    output.y += dot(input, weight_y);
+    
+    half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+    output.z += dot(input, weight_z);
+    
+    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+    output.w += dot(input, weight_w);
+  }
+  // Bias, scale and shift are applied in float, clamped at 0, then narrowed to half.
+  output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
+  outTexture.write(output, gid.xy, gid.z);
+}
+// 3x3 convolution + bias add + scale/shift + ReLU on half textures; one thread per output texel.
+kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                              constant MetalConvParam &param [[buffer(0)]],
+                                              const device half4 *weights [[buffer(1)]],
+                                              const device half4 *biase [[buffer(2)]],
+                                              const device float4 *new_scale [[buffer(3)]],
+                                              const device float4 *new_biase [[buffer(4)]],
+                                              uint3 gid [[thread_position_in_grid]]) {
+  // Skip threads that fall outside the output texture.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9;
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+  
+  half4 output = half4(0.0);
+  // Per input slice: gather the 3x3 neighbourhood row by row (sampler clamps to zero), then accumulate per lane.
+  half4 input[9];
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
+    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
+    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
+    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
+    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
+    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
+    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+    for (int j = 0; j < 9; ++j) {
+      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.x += dot(input[j], weight_x);
+      
+      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.y += dot(input[j], weight_y);
+      
+      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.z += dot(input[j], weight_z);
+      
+      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.w += dot(input[j], weight_w);
+    }
+  }
+  output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+// Depthwise 3x3 convolution + bias add + scale/shift + ReLU on half textures: each lane is filtered with its own 9 scalar weights.
+kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                                        texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                                        constant MetalConvParam &param [[buffer(0)]],
+                                                        const device half *weights [[buffer(1)]],
+                                                        const device half4 *biase [[buffer(2)]],
+                                                        const device float4 *new_scale [[buffer(3)]],
+                                                        const device float4 *new_biase [[buffer(4)]],
+                                                        uint3 gid [[thread_position_in_grid]]) {
+  // Skip threads that fall outside the output texture.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  uint output_slice = gid.z;
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9;
+  uint weithTo = gid.z * kernelHXW * 4;
+  half4 output = half4(0.0);
+  half4 inputs[9];
+  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+  for (int j = 0; j < 9; ++j) {
+    half4 input = inputs[j];
+    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+  }
+  output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+
+/*---------------------------------------------*/
+
+
+// 1x1 convolution + bias add + scale/shift + ReLU on float textures; one thread per output texel.
+kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                         constant MetalConvParam &param [[buffer(0)]],
+                                         const device float4 *weights [[buffer(1)]],
+                                         const device float4 *biase [[buffer(2)]],
+                                         const device float4 *new_scale [[buffer(3)]],
+                                         const device float4 *new_biase [[buffer(4)]],
+                                         uint3 gid [[thread_position_in_grid]]) {
+  // Skip threads that fall outside the output texture.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 1;
+  
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+  
+  float4 output = float4(0.0);
+  // Each output lane (x/y/z/w) dots the input texel with its own run of input_arr_size weight vectors.
+  float4 input;
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+    float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+    output.x += dot(input, weight_x);
+    
+    float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+    output.y += dot(input, weight_y);
+    
+    float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+    output.z += dot(input, weight_z);
+    
+    float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+    output.w += dot(input, weight_w);
+  }
+  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                         constant MetalConvParam &param [[buffer(0)]],
+                                         const device float4 *weights [[buffer(1)]],
+                                         const device float4 *biase [[buffer(2)]],
+                                         const device float4 *new_scale [[buffer(3)]],
+                                         const device float4 *new_biase [[buffer(4)]],
+                                         uint3 gid [[thread_position_in_grid]]) {
+  // 3x3 convolution, then (conv + biase) * new_scale + new_biase clamped at zero; each thread writes one texel of slice gid.z.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;  // this thread falls outside the output texture
+  }
+  // Centre of the 3x3 window in input pixel coordinates.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 center = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  // Pixel (unnormalised) coordinates, nearest filtering, clamp_to_zero addressing.
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint taps = 9;
+  const uint slices = inTexture.get_array_size();
+  const uint weightBase = gid.z * taps * slices * 4;
+  const uint channelStride = taps * slices;  // float4 entries per output channel
+  float4 acc = float4(0.0);
+  // window[] holds the 3x3 neighbourhood of the current input slice, row by row.
+  float4 window[9];
+  for (uint s = 0; s < slices; ++s) {
+    window[0] = inTexture.sample(sample, float2(center.x - 1, center.y - 1), s);
+    window[1] = inTexture.sample(sample, float2(center.x,     center.y - 1), s);
+    window[2] = inTexture.sample(sample, float2(center.x + 1, center.y - 1), s);
+    window[3] = inTexture.sample(sample, float2(center.x - 1, center.y),     s);
+    window[4] = inTexture.sample(sample, float2(center.x,     center.y),     s);
+    window[5] = inTexture.sample(sample, float2(center.x + 1, center.y),     s);
+    window[6] = inTexture.sample(sample, float2(center.x - 1, center.y + 1), s);
+    window[7] = inTexture.sample(sample, float2(center.x,     center.y + 1), s);
+    window[8] = inTexture.sample(sample, float2(center.x + 1, center.y + 1), s);
+    for (int t = 0; t < 9; ++t) {
+      // Weights for one output channel are indexed as [tap][input slice].
+      const uint tapIndex = weightBase + t * slices + s;
+      const float4 texel = window[t];
+      
+      acc.x += dot(texel, weights[tapIndex + 0 * channelStride]);
+      
+      acc.y += dot(texel, weights[tapIndex + 1 * channelStride]);
+      
+      acc.z += dot(texel, weights[tapIndex + 2 * channelStride]);
+      
+      acc.w += dot(texel, weights[tapIndex + 3 * channelStride]);
+    }
+  }
+  acc = fmax((acc + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  outTexture.write(acc, gid.xy, gid.z);
+}
+
+kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                                   texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                                   constant MetalConvParam &param [[buffer(0)]],
+                                                   const device float *weights [[buffer(1)]],
+                                                   const device float4 *biase [[buffer(2)]],
+                                                   const device float4 *new_scale [[buffer(3)]],
+                                                   const device float4 *new_biase [[buffer(4)]],
+                                                   uint3 gid [[thread_position_in_grid]]) {
+  // Depthwise 3x3 convolution: each output channel reads only the same channel of input slice gid.z; then bias, scale/shift, clamp at zero.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;  // this thread falls outside the output texture
+  }
+  const uint slice = gid.z;
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 center = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint taps = 9;
+  const uint weightBase = slice * taps * 4;  // first scalar weight of this slice's four channels
+  float4 acc = float4(0.0);
+  float4 window[9];  // 3x3 neighbourhood, row by row
+  window[0] = inTexture.sample(sample, float2(center.x - 1, center.y - 1), slice);
+  window[1] = inTexture.sample(sample, float2(center.x,     center.y - 1), slice);
+  window[2] = inTexture.sample(sample, float2(center.x + 1, center.y - 1), slice);
+  window[3] = inTexture.sample(sample, float2(center.x - 1, center.y),     slice);
+  window[4] = inTexture.sample(sample, float2(center.x,     center.y),     slice);
+  window[5] = inTexture.sample(sample, float2(center.x + 1, center.y),     slice);
+  window[6] = inTexture.sample(sample, float2(center.x - 1, center.y + 1), slice);
+  window[7] = inTexture.sample(sample, float2(center.x,     center.y + 1), slice);
+  window[8] = inTexture.sample(sample, float2(center.x + 1, center.y + 1), slice);
+  for (int t = 0; t < 9; ++t) {
+    const float4 texel = window[t];
+    acc.x += texel.x * weights[weightBase + 0 * taps + t];
+    acc.y += texel.y * weights[weightBase + 1 * taps + t];
+    acc.z += texel.z * weights[weightBase + 2 * taps + t];
+    acc.w += texel.w * weights[weightBase + 3 * taps + t];
+  }
+  acc = fmax((acc + biase[slice]) * new_scale[slice] + new_biase[slice], 0.0);
+  outTexture.write(acc, gid.xy, slice);
+}
+
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal
new file mode 100644
index 0000000000000000000000000000000000000000..9244b2ec4631015ffd192567f734bee4cc1c7c85
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal
@@ -0,0 +1,306 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+#include "Common.metal"
+
+using namespace metal;
+
+#pragma mark - convAdd
+kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+  // 1x1 convolution plus bias; each thread writes one texel of output slice gid.z.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;  // this thread falls outside the output texture
+  }
+  // Input pixel that corresponds to this output texel.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 pixel = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  // Pixel (unnormalised) coordinates, nearest filtering, clamp_to_zero addressing.
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint taps = 1;
+  
+  const uint slices = inTexture.get_array_size();
+  const uint weightBase = gid.z * taps * slices * 4;
+  
+  float4 acc = float4(0.0);
+  
+  // One float4 weight per (output channel, input slice) pair.
+  for (uint s = 0; s < slices; ++s) {
+    const float4 texel = inTexture.sample(sample, float2(pixel.x, pixel.y), s);
+    const float4 wx = weights[weightBase + 0 * taps * slices + s];
+    acc.x += dot(texel, wx);
+    
+    const float4 wy = weights[weightBase + 1 * taps * slices + s];
+    acc.y += dot(texel, wy);
+    
+    const float4 wz = weights[weightBase + 2 * taps * slices + s];
+    acc.z += dot(texel, wz);
+    
+    const float4 ww = weights[weightBase + 3 * taps * slices + s];
+    acc.w += dot(texel, ww);
+  }
+  acc = acc + biase[gid.z];
+  outTexture.write(acc, gid.xy, gid.z);
+}
+
+kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         const device float4 *new_scale [[buffer(3)]],
+                         const device float4 *new_biase [[buffer(4)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+  // Dilated 3x3 convolution plus bias. new_scale and new_biase are bound but never read in this kernel.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;  // this thread falls outside the output texture
+  }
+  // Centre of the 3x3 window in input pixel coordinates.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 center = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  // Pixel (unnormalised) coordinates, nearest filtering, clamp_to_zero addressing.
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint taps = 9;
+  const uint slices = inTexture.get_array_size();
+  const uint weightBase = gid.z * taps * slices * 4;
+  const uint channelStride = taps * slices;  // float4 entries per output channel
+  float4 acc = float4(0.0);
+  // Spacing between neighbouring taps of the window.
+  const ushort dilX = param.dilationX;
+  const ushort dilY = param.dilationY;
+  // window[] holds the dilated 3x3 neighbourhood of the current input slice, row by row.
+  float4 window[9];
+  for (uint s = 0; s < slices; ++s) {
+    window[0] = inTexture.sample(sample, float2(center.x - dilX, center.y - dilY), s);
+    window[1] = inTexture.sample(sample, float2(center.x,        center.y - dilY), s);
+    window[2] = inTexture.sample(sample, float2(center.x + dilX, center.y - dilY), s);
+    window[3] = inTexture.sample(sample, float2(center.x - dilX, center.y),        s);
+    window[4] = inTexture.sample(sample, float2(center.x,        center.y),        s);
+    window[5] = inTexture.sample(sample, float2(center.x + dilX, center.y),        s);
+    window[6] = inTexture.sample(sample, float2(center.x - dilX, center.y + dilY), s);
+    window[7] = inTexture.sample(sample, float2(center.x,        center.y + dilY), s);
+    window[8] = inTexture.sample(sample, float2(center.x + dilX, center.y + dilY), s);
+    for (int t = 0; t < 9; ++t) {
+      // Weights for one output channel are indexed as [tap][input slice].
+      const uint tapIndex = weightBase + t * slices + s;
+      const float4 texel = window[t];
+      
+      acc.x += dot(texel, weights[tapIndex + 0 * channelStride]);
+      
+      acc.y += dot(texel, weights[tapIndex + 1 * channelStride]);
+      
+      acc.z += dot(texel, weights[tapIndex + 2 * channelStride]);
+      
+      acc.w += dot(texel, weights[tapIndex + 3 * channelStride]);
+    }
+  }
+  acc = acc + biase[gid.z];
+  outTexture.write(acc, gid.xy, gid.z);
+}
+
+kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                   texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                   constant MetalConvParam &param [[buffer(0)]],
+                                   const device float *weights [[buffer(1)]],
+                                   const device float4 *biase [[buffer(2)]],
+                                   const device float4 *new_scale [[buffer(3)]],
+                                   const device float4 *new_biase [[buffer(4)]],
+                                   uint3 gid [[thread_position_in_grid]]) {
+  // Depthwise 3x3 convolution plus bias with fixed +/-1 offsets; new_scale, new_biase and the dilation fields are not read here.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;  // this thread falls outside the output texture
+  }
+  const uint slice = gid.z;
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 center = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint taps = 9;
+  const uint weightBase = slice * taps * 4;  // first scalar weight of this slice's four channels
+  float4 acc = float4(0.0);
+  float4 window[9];  // 3x3 neighbourhood, row by row
+  window[0] = inTexture.sample(sample, float2(center.x - 1, center.y - 1), slice);
+  window[1] = inTexture.sample(sample, float2(center.x,     center.y - 1), slice);
+  window[2] = inTexture.sample(sample, float2(center.x + 1, center.y - 1), slice);
+  window[3] = inTexture.sample(sample, float2(center.x - 1, center.y),     slice);
+  window[4] = inTexture.sample(sample, float2(center.x,     center.y),     slice);
+  window[5] = inTexture.sample(sample, float2(center.x + 1, center.y),     slice);
+  window[6] = inTexture.sample(sample, float2(center.x - 1, center.y + 1), slice);
+  window[7] = inTexture.sample(sample, float2(center.x,     center.y + 1), slice);
+  window[8] = inTexture.sample(sample, float2(center.x + 1, center.y + 1), slice);
+  for (int t = 0; t < 9; ++t) {
+    const float4 texel = window[t];
+    acc.x += texel.x * weights[weightBase + 0 * taps + t];
+    acc.y += texel.y * weights[weightBase + 1 * taps + t];
+    acc.z += texel.z * weights[weightBase + 2 * taps + t];
+    acc.w += texel.w * weights[weightBase + 3 * taps + t];
+  }
+  acc = acc + biase[slice];
+  outTexture.write(acc, gid.xy, slice);
+}
+
+
+#pragma mark - half
+
+kernel void conv_add_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<half, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device half4 *weights [[buffer(1)]],
+                         const device half4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+  // Half-storage 1x1 convolution plus bias; each thread writes one texel of output slice gid.z.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;  // this thread falls outside the output texture
+  }
+  // Input pixel that corresponds to this output texel.
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  // Pixel (unnormalised) coordinates, nearest filtering, clamp_to_zero addressing.
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 1;
+  
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+  // Accumulate in float; operands are widened before each dot so the products are not rounded to half.
+  float4 output = float4(0.0);
+  
+  half4 input;
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+    half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+    output.x += dot(float4(input), float4(weight_x));
+    
+    half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+    output.y += dot(float4(input), float4(weight_y));
+    
+    half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+    output.z += dot(float4(input), float4(weight_z));
+    
+    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+    output.w += dot(float4(input), float4(weight_w));
+  }
+  output = output + float4(biase[gid.z]);
+  outTexture.write(half4(output), gid.xy, gid.z);
+}
+
+kernel void conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<half, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device half4 *weights [[buffer(1)]],
+                         const device half4 *biase [[buffer(2)]],
+                         const device half4 *new_scale [[buffer(3)]],
+                         const device half4 *new_biase [[buffer(4)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+  // Half-storage dilated 3x3 convolution plus bias. new_scale and new_biase are bound but never read in this kernel.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;  // this thread falls outside the output texture
+  }
+  // Centre of the 3x3 window in input pixel coordinates.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 center = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  // Pixel (unnormalised) coordinates, nearest filtering, clamp_to_zero addressing.
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint taps = 9;
+  const uint slices = inTexture.get_array_size();
+  const uint weightBase = gid.z * taps * slices * 4;
+  const uint channelStride = taps * slices;  // half4 entries per output channel
+  float4 acc = float4(0.0);
+  // Spacing between neighbouring taps of the window.
+  const ushort dilX = param.dilationX;
+  const ushort dilY = param.dilationY;
+  // window[] holds the dilated 3x3 neighbourhood of the current input slice, row by row.
+  half4 window[9];
+  for (uint s = 0; s < slices; ++s) {
+    window[0] = inTexture.sample(sample, float2(center.x - dilX, center.y - dilY), s);
+    window[1] = inTexture.sample(sample, float2(center.x,        center.y - dilY), s);
+    window[2] = inTexture.sample(sample, float2(center.x + dilX, center.y - dilY), s);
+    window[3] = inTexture.sample(sample, float2(center.x - dilX, center.y),        s);
+    window[4] = inTexture.sample(sample, float2(center.x,        center.y),        s);
+    window[5] = inTexture.sample(sample, float2(center.x + dilX, center.y),        s);
+    window[6] = inTexture.sample(sample, float2(center.x - dilX, center.y + dilY), s);
+    window[7] = inTexture.sample(sample, float2(center.x,        center.y + dilY), s);
+    window[8] = inTexture.sample(sample, float2(center.x + dilX, center.y + dilY), s);
+    for (int t = 0; t < 9; ++t) {
+      // Weights for one output channel are indexed as [tap][input slice]; operands are widened to float.
+      const uint tapIndex = weightBase + t * slices + s;
+      const float4 texel = float4(window[t]);
+      
+      acc.x += dot(texel, float4(weights[tapIndex + 0 * channelStride]));
+      
+      acc.y += dot(texel, float4(weights[tapIndex + 1 * channelStride]));
+      
+      acc.z += dot(texel, float4(weights[tapIndex + 2 * channelStride]));
+      
+      acc.w += dot(texel, float4(weights[tapIndex + 3 * channelStride]));
+    }
+  }
+  acc = acc + float4(biase[gid.z]);
+  outTexture.write(half4(acc), gid.xy, gid.z);
+}
+
+kernel void depthwise_conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                   texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                   constant MetalConvParam &param [[buffer(0)]],
+                                   const device half *weights [[buffer(1)]],
+                                   const device half4 *biase [[buffer(2)]],
+                                   const device half4 *new_scale [[buffer(3)]],
+                                   const device half4 *new_biase [[buffer(4)]],
+                                   uint3 gid [[thread_position_in_grid]]) {
+  // Half-storage depthwise 3x3 convolution plus bias; new_scale and new_biase are bound but never read in this kernel.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;  // this thread falls outside the output texture
+  }
+  uint output_slice = gid.z;  // depthwise: the input slice read equals the output slice written
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9;
+  uint weithTo = gid.z * kernelHXW * 4;  // first scalar weight of this slice's four channels
+  float4 output = float4(0.0);  // accumulate in float
+  half4 inputs[9];  // 3x3 neighbourhood, row by row
+  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+  for (int j = 0; j < 9; ++j) {
+    half4 input = inputs[j];
+    output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]);
+    output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]);
+    output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
+    output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);  // widened like x/y/z so the product is not rounded to half
+  }
+  output = output + float4(biase[gid.z]);
+  outTexture.write(half4(output), gid.xy, gid.z);
+}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvBNReluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvBNReluKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..4b97b7829a1fba27704fe7b60a03b2672f4f5953
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvBNReluKernel.metal
@@ -0,0 +1,297 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+#include "Common.metal"
+
+using namespace metal;
+
+#pragma mark - conv bn relu
+kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                     texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                     constant MetalConvParam &param [[buffer(0)]],
+                                     const device float4 *weights [[buffer(1)]],
+                                     const device float4 *new_scale [[buffer(2)]],
+                                     const device float4 *new_biase [[buffer(3)]],
+                                     uint3 gid [[thread_position_in_grid]]) {
+  // 1x1 convolution, then conv * new_scale + new_biase clamped at zero; each thread writes one texel of slice gid.z.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;  // this thread falls outside the output texture
+  }
+  // Input pixel that corresponds to this output texel.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 pixel = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  // Pixel (unnormalised) coordinates, nearest filtering, clamp_to_zero addressing.
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint taps = 1;
+  
+  const uint slices = inTexture.get_array_size();
+  const uint weightBase = gid.z * taps * slices * 4;
+  
+  float4 acc = float4(0.0);
+  
+  // One float4 weight per (output channel, input slice) pair.
+  for (uint s = 0; s < slices; ++s) {
+    const float4 texel = inTexture.sample(sample, float2(pixel.x, pixel.y), s);
+    const float4 wx = weights[weightBase + 0 * taps * slices + s];
+    acc.x += dot(texel, wx);
+    
+    const float4 wy = weights[weightBase + 1 * taps * slices + s];
+    acc.y += dot(texel, wy);
+    
+    const float4 wz = weights[weightBase + 2 * taps * slices + s];
+    acc.z += dot(texel, wz);
+    
+    const float4 ww = weights[weightBase + 3 * taps * slices + s];
+    acc.w += dot(texel, ww);
+  }
+  acc = fmax(acc * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  outTexture.write(acc, gid.xy, gid.z);
+}
+
+kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                     texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                     constant MetalConvParam &param [[buffer(0)]],
+                                     const device float4 *weights [[buffer(1)]],
+                                     const device float4 *new_scale [[buffer(2)]],
+                                     const device float4 *new_biase [[buffer(3)]],
+                                     uint3 gid [[thread_position_in_grid]]) {
+  // 3x3 convolution, then conv * new_scale + new_biase clamped at zero; each thread writes one texel of slice gid.z.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;  // this thread falls outside the output texture
+  }
+  // Centre of the 3x3 window in input pixel coordinates.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 center = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  // Pixel (unnormalised) coordinates, nearest filtering, clamp_to_zero addressing.
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint taps = 9;
+  const uint slices = inTexture.get_array_size();
+  const uint weightBase = gid.z * taps * slices * 4;
+  const uint channelStride = taps * slices;  // float4 entries per output channel
+  float4 acc = float4(0.0);
+  // window[] holds the 3x3 neighbourhood of the current input slice, row by row.
+  float4 window[9];
+  for (uint s = 0; s < slices; ++s) {
+    window[0] = inTexture.sample(sample, float2(center.x - 1, center.y - 1), s);
+    window[1] = inTexture.sample(sample, float2(center.x,     center.y - 1), s);
+    window[2] = inTexture.sample(sample, float2(center.x + 1, center.y - 1), s);
+    window[3] = inTexture.sample(sample, float2(center.x - 1, center.y),     s);
+    window[4] = inTexture.sample(sample, float2(center.x,     center.y),     s);
+    window[5] = inTexture.sample(sample, float2(center.x + 1, center.y),     s);
+    window[6] = inTexture.sample(sample, float2(center.x - 1, center.y + 1), s);
+    window[7] = inTexture.sample(sample, float2(center.x,     center.y + 1), s);
+    window[8] = inTexture.sample(sample, float2(center.x + 1, center.y + 1), s);
+    for (int t = 0; t < 9; ++t) {
+      // Weights for one output channel are indexed as [tap][input slice].
+      const uint tapIndex = weightBase + t * slices + s;
+      const float4 texel = window[t];
+      
+      acc.x += dot(texel, weights[tapIndex + 0 * channelStride]);
+      
+      acc.y += dot(texel, weights[tapIndex + 1 * channelStride]);
+      
+      acc.z += dot(texel, weights[tapIndex + 2 * channelStride]);
+      
+      acc.w += dot(texel, weights[tapIndex + 3 * channelStride]);
+    }
+  }
+  acc = fmax(acc * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  outTexture.write(acc, gid.xy, gid.z);
+}
+
+kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                               texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                               constant MetalConvParam &param [[buffer(0)]],
+                                               const device float *weights [[buffer(1)]],
+                                               const device float4 *new_scale [[buffer(2)]],
+                                               const device float4 *new_biase [[buffer(3)]],
+                                               uint3 gid [[thread_position_in_grid]]) {
+  // Depthwise 3x3 convolution: each output channel reads only the same channel of input slice gid.z; then scale/shift, clamp at zero.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;  // this thread falls outside the output texture
+  }
+  const uint slice = gid.z;
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 center = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint taps = 9;
+  const uint weightBase = slice * taps * 4;  // first scalar weight of this slice's four channels
+  float4 acc = float4(0.0);
+  float4 window[9];  // 3x3 neighbourhood, row by row
+  window[0] = inTexture.sample(sample, float2(center.x - 1, center.y - 1), slice);
+  window[1] = inTexture.sample(sample, float2(center.x,     center.y - 1), slice);
+  window[2] = inTexture.sample(sample, float2(center.x + 1, center.y - 1), slice);
+  window[3] = inTexture.sample(sample, float2(center.x - 1, center.y),     slice);
+  window[4] = inTexture.sample(sample, float2(center.x,     center.y),     slice);
+  window[5] = inTexture.sample(sample, float2(center.x + 1, center.y),     slice);
+  window[6] = inTexture.sample(sample, float2(center.x - 1, center.y + 1), slice);
+  window[7] = inTexture.sample(sample, float2(center.x,     center.y + 1), slice);
+  window[8] = inTexture.sample(sample, float2(center.x + 1, center.y + 1), slice);
+  for (int t = 0; t < 9; ++t) {
+    const float4 texel = window[t];
+    acc.x += texel.x * weights[weightBase + 0 * taps + t];
+    acc.y += texel.y * weights[weightBase + 1 * taps + t];
+    acc.z += texel.z * weights[weightBase + 2 * taps + t];
+    acc.w += texel.w * weights[weightBase + 3 * taps + t];
+  }
+  acc = fmax(acc * new_scale[slice] + new_biase[slice], 0.0);
+  outTexture.write(acc, gid.xy, slice);
+}
+
+#pragma mark - half
+kernel void conv_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                     texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                     constant MetalConvParam &param [[buffer(0)]],
+                                     const device half4 *weights [[buffer(1)]],
+                                     const device half4 *new_scale [[buffer(2)]],
+                                     const device half4 *new_biase [[buffer(3)]],
+                                     uint3 gid [[thread_position_in_grid]]) {
+  // Half-storage 1x1 convolution, then conv * new_scale + new_biase clamped at zero; arithmetic is done in float.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;  // this thread falls outside the output texture
+  }
+  // Input pixel that corresponds to this output texel.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 pixel = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  // Pixel (unnormalised) coordinates, nearest filtering, clamp_to_zero addressing.
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint taps = 1;
+  
+  const uint slices = inTexture.get_array_size();
+  const uint weightBase = gid.z * taps * slices * 4;
+  
+  float4 acc = float4(0.0);
+  
+  // One half4 weight per (output channel, input slice) pair, widened to float before the dot product.
+  for (uint s = 0; s < slices; ++s) {
+    const float4 texel = float4(inTexture.sample(sample, float2(pixel.x, pixel.y), s));
+    const half4 wx = weights[weightBase + 0 * taps * slices + s];
+    acc.x += dot(texel, float4(wx));
+    
+    const half4 wy = weights[weightBase + 1 * taps * slices + s];
+    acc.y += dot(texel, float4(wy));
+    
+    const half4 wz = weights[weightBase + 2 * taps * slices + s];
+    acc.z += dot(texel, float4(wz));
+    
+    const half4 ww = weights[weightBase + 3 * taps * slices + s];
+    acc.w += dot(texel, float4(ww));
+  }
+  acc = fmax(acc * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+  outTexture.write(half4(acc), gid.xy, gid.z);
+}
+
+kernel void conv_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                     texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                     constant MetalConvParam &param [[buffer(0)]],
+                                     const device half4 *weights [[buffer(1)]], // half4 weights, one per (component, tap, input slice)
+                                     const device half4 *new_scale [[buffer(2)]], // multiplier, indexed by output slice
+                                     const device half4 *new_biase [[buffer(3)]], // addend, indexed by output slice
+                                     uint3 gid [[thread_position_in_grid]]) {
+  // 3x3 convolution summed over all input slices, then output * new_scale + new_biase clamped at zero.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return; // this thread has no texel in the output texture
+  }
+  // Centre of the 3x3 window in input coordinates: gid.xy * stride + offset.
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  // NOTE(review): offsetX/offsetY are cast to ushort above; confirm negative (padding) offsets behave as intended.
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9; // taps in one 3x3 window
+  uint input_arr_size = inTexture.get_array_size(); // number of input slices, each sampled as a half4
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4; // index of the first weight for output slice gid.z
+  // Accumulate in float; the sum is converted to half4 only when it is written out.
+  float4 output = float4(0.0);
+  // input[] holds the 3x3 neighbourhood of one input slice, row by row starting at the top-left tap.
+  half4 input[9];
+  for (uint i = 0; i < input_arr_size; ++i) { // i = input slice
+    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
+    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
+    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
+    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
+    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
+    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
+    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+    for (int j = 0; j < 9; ++j) { // j = tap; weight index = base + component * 9 * slices + j * slices + i
+      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.x += dot(float4(input[j]), float4(weight_x)); // output component x
+      
+      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.y += dot(float4(input[j]), float4(weight_y)); // output component y
+      
+      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.z += dot(float4(input[j]), float4(weight_z)); // output component z
+      
+      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.w += dot(float4(input[j]), float4(weight_w)); // output component w
+    }
+  }
+  output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); // scale, bias, then max with 0
+  outTexture.write(half4(output), gid.xy, gid.z);
+}
+
+kernel void depthwise_conv_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                               texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                               constant MetalConvParam &param [[buffer(0)]],
+                                               const device half *weights [[buffer(1)]], // scalar weights: 9 taps per channel
+                                               const device half4 *new_scale [[buffer(2)]], // multiplier, indexed by slice
+                                               const device half4 *new_biase [[buffer(3)]], // addend, indexed by slice
+                                               uint3 gid [[thread_position_in_grid]]) {
+  // Depthwise 3x3 convolution: input slice gid.z feeds output slice gid.z; then scale, bias and clamp at zero.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return; // this thread has no texel in the output texture
+  }
+  const ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); // window centre
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9; // taps in one 3x3 window
+  const uint weightBase = gid.z * kernelHXW * 4; // 4 channels per slice, 9 consecutive weights per channel
+  float4 output = float4(0.0);
+  half4 inputs[9]; // 3x3 neighbourhood, row by row starting at the top-left tap
+  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), gid.z);
+  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), gid.z);
+  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), gid.z);
+  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), gid.z);
+  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), gid.z);
+  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), gid.z);
+  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), gid.z);
+  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), gid.z);
+  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), gid.z);
+  for (int j = 0; j < 9; ++j) {
+    // Widen both operands to float first: a half * half product is rounded (and can overflow) in half precision.
+    const float4 tap = float4(inputs[j]);
+    output.x += tap.x * float(weights[weightBase + 0 * kernelHXW + j]);
+    output.y += tap.y * float(weights[weightBase + 1 * kernelHXW + j]);
+    output.z += tap.z * float(weights[weightBase + 2 * kernelHXW + j]);
+    output.w += tap.w * float(weights[weightBase + 3 * kernelHXW + j]);
+  }
+  output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); // scale, bias, then max with 0
+  outTexture.write(half4(output), gid.xy, gid.z);
+}
+
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
index a17366cbe0e67b4cd38cafe4e09909f537d269bf..c8a2da5407742cb71cb16d118baa2b446bb30334 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
@@ -13,349 +13,9 @@
  limitations under the License. */
 
 #include <metal_stdlib>
+#include "Common.metal"
 using namespace metal;
 
-struct MetalConvParam {
-  short offsetX;
-  short offsetY;
-  short offsetZ;
-  ushort strideX;
-  ushort strideY;
-  ushort dilationX;
-  ushort dilationY;
-};
-
-kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                              texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                              constant MetalConvParam &param [[buffer(0)]],
-                                              const device half4 *weights [[buffer(1)]],
-                                              const device half4 *biase [[buffer(2)]],
-                                              const device float4 *new_scale [[buffer(3)]],
-                                              const device float4 *new_biase [[buffer(4)]],
-                                              uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  half4 output = half4(0.0);
-  
-  half4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
-    
-    half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
-    
-    half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
-    
-    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-  
-  output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
-  outTexture.write(output, gid.xy, gid.z);
-}
-
-kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                              texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                              constant MetalConvParam &param [[buffer(0)]],
-                                              const device half4 *weights [[buffer(1)]],
-                                              const device half4 *biase [[buffer(2)]],
-                                              const device float4 *new_scale [[buffer(3)]],
-                                              const device float4 *new_biase [[buffer(4)]],
-                                              uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  half4 output = half4(0.0);
-  
-  half4 input[9];
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
-    for (int j = 0; j < 9; ++j) {
-      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
-    }
-  }
-  output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
-  outTexture.write(output, gid.xy, gid.z);
-}
-
-kernel void conv_add_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                              texture2d_array<half, access::write> outTexture [[texture(1)]],
-                              constant MetalConvParam &param [[buffer(0)]],
-                              const device half4 *weights [[buffer(1)]],
-                              const device half4 *biase [[buffer(2)]],
-                              uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  half4 output = half4(0.0);
-  
-  half4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
-    
-    half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
-    
-    half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
-    
-    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-  output = output + biase[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
-}
-
-kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                                        texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                                        constant MetalConvParam &param [[buffer(0)]],
-                                                        const device half *weights [[buffer(1)]],
-                                                        const device half4 *biase [[buffer(2)]],
-                                                        const device float4 *new_scale [[buffer(3)]],
-                                                        const device float4 *new_biase [[buffer(4)]],
-                                                        uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  half4 output = half4(0.0);
-  half4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    half4 input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-  output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
-  outTexture.write(output, gid.xy, gid.z);
-}
-
-
-/*---------------------------------------------*/
-
-
-
-kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
-                                         constant MetalConvParam &param [[buffer(0)]],
-                                         const device float4 *weights [[buffer(1)]],
-                                         const device float4 *biase [[buffer(2)]],
-                                         const device float4 *new_scale [[buffer(3)]],
-                                         const device float4 *new_biase [[buffer(4)]],
-                                         uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  float4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
-    
-    float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
-    
-    float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
-    
-    float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
-  outTexture.write(output, gid.xy, gid.z);
-}
-
-kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
-                                         constant MetalConvParam &param [[buffer(0)]],
-                                         const device float4 *weights [[buffer(1)]],
-                                         const device float4 *biase [[buffer(2)]],
-                                         const device float4 *new_scale [[buffer(3)]],
-                                         const device float4 *new_biase [[buffer(4)]],
-                                         uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  float4 input[9];
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
-    for (int j = 0; j < 9; ++j) {
-      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
-    }
-  }
-  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
-  outTexture.write(output, gid.xy, gid.z);
-}
-
-kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                                                   texture2d_array<float, access::write> outTexture [[texture(1)]],
-                                                   constant MetalConvParam &param [[buffer(0)]],
-                                                   const device float *weights [[buffer(1)]],
-                                                   const device float4 *biase [[buffer(2)]],
-                                                   const device float4 *new_scale [[buffer(3)]],
-                                                   const device float4 *new_biase [[buffer(4)]],
-                                                   uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = float4(0.0);
-  float4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    float4 input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
-  outTexture.write(output, gid.xy, gid.z);
-}
-
 // conv
 #pragma mark -- conv
 kernel void conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -487,285 +147,4 @@ kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(
   outTexture.write(output, gid.xy, gid.z);
 }
 
-#pragma mark - convAdd
-kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                         texture2d_array<float, access::write> outTexture [[texture(1)]],
-                         constant MetalConvParam &param [[buffer(0)]],
-                         const device float4 *weights [[buffer(1)]],
-                         const device float4 *biase [[buffer(2)]],
-                         uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  float4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
-    
-    float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
-    
-    float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
-    
-    float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-  output = output + biase[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
-}
-
-kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                         texture2d_array<float, access::write> outTexture [[texture(1)]],
-                         constant MetalConvParam &param [[buffer(0)]],
-                         const device float4 *weights [[buffer(1)]],
-                         const device float4 *biase [[buffer(2)]],
-                         const device float4 *new_scale [[buffer(3)]],
-                         const device float4 *new_biase [[buffer(4)]],
-                         uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  ushort dilation_x = param.dilationX;
-  ushort dilation_y = param.dilationY;
-  
-  float4 input[9];
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y - dilation_y), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y - dilation_y), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
-    for (int j = 0; j < 9; ++j) {
-      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
-    }
-  }
-  output = output + biase[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
-}
-
-kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                                   texture2d_array<float, access::write> outTexture [[texture(1)]],
-                                   constant MetalConvParam &param [[buffer(0)]],
-                                   const device float *weights [[buffer(1)]],
-                                   const device float4 *biase [[buffer(2)]],
-                                   const device float4 *new_scale [[buffer(3)]],
-                                   const device float4 *new_biase [[buffer(4)]],
-                                   uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = float4(0.0);
-  float4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    float4 input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-  output = output + biase[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
-}
-
-#pragma mark - conv bn relu
-kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                                     texture2d_array<float, access::write> outTexture [[texture(1)]],
-                                     constant MetalConvParam &param [[buffer(0)]],
-                                     const device float4 *weights [[buffer(1)]],
-                                     const device float4 *new_scale [[buffer(2)]],
-                                     const device float4 *new_biase [[buffer(3)]],
-                                     uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  float4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
-    
-    float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
-    
-    float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
-    
-    float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-  output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
-  outTexture.write(output, gid.xy, gid.z);
-}
-
-kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                                     texture2d_array<float, access::write> outTexture [[texture(1)]],
-                                     constant MetalConvParam &param [[buffer(0)]],
-                                     const device float4 *weights [[buffer(1)]],
-                                     const device float4 *new_scale [[buffer(2)]],
-                                     const device float4 *new_biase [[buffer(3)]],
-                                     uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  float4 input[9];
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
-    for (int j = 0; j < 9; ++j) {
-      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
-    }
-  }
-  output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
-  outTexture.write(output, gid.xy, gid.z);
-}
-
-kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                                               texture2d_array<float, access::write> outTexture [[texture(1)]],
-                                               constant MetalConvParam &param [[buffer(0)]],
-                                               const device float *weights [[buffer(1)]],
-                                               const device float4 *new_scale [[buffer(2)]],
-                                               const device float4 *new_biase [[buffer(3)]],
-                                               uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = float4(0.0);
-  float4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    float4 input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-  output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
-  outTexture.write(output, gid.xy, gid.z);
-}
 
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal
index a45063b3c09873b7c41f7540d1d33df2b1559e54..8bce97f21efe159489e2bc65e598c3252b567b79 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal
@@ -79,113 +79,6 @@ kernel void texture2d_to_2d_array_half(texture2d<half, access::read> inTexture [
   outTexture.write(input, gid.xy, 0);
 }
 
-struct PoolParam {
-  int ksizeX;
-  int ksizeY;
-  int strideX;
-  int strideY;
-  int paddingX;
-  int paddingY;
-  int poolType;
-};
-
-kernel void pool(texture2d_array<float, access::read> inTexture [[texture(0)]],
-                 texture2d_array<float, access::write> outTexture [[texture(1)]],
-                 constant PoolParam &pm [[buffer(0)]],
-                 uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  int xmin = gid.x * pm.strideX - pm.paddingX;
-  int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
-  xmin = max(xmin, 0);
-  int ymin = gid.y * pm.strideX - pm.paddingX;
-  int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height()));
-  ymin = max(ymin, 0);
-  
-  float4 r = 0;
-  if (pm.poolType == 0) {
-    r = inTexture.read(uint2(xmin, ymin), gid.z);
-    for (int x = xmin; x < xmax; x++) {
-      for (int y = ymin; y < ymax; y++) {
-        r = fmax(r, inTexture.read(uint2(x, y), gid.z));
-      }
-    }
-  } else if (pm.poolType == 1) {
-    for (int x = xmin; x < xmax; x++) {
-      for (int y = ymin; y < ymax; y++) {
-        r += inTexture.read(uint2(x, y), gid.z);
-      }
-    }
-    r /= pm.ksizeX * pm.ksizeY;
-  }
-  outTexture.write(r, gid.xy, gid.z);
-}
-
-
-kernel void pool_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
-                      texture2d_array<half, access::write> outTexture [[texture(1)]],
-                      constant PoolParam &pm [[buffer(0)]],
-                      uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  int xmin = gid.x * pm.strideX - pm.paddingX;
-  int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
-  xmin = max(xmin, 0);
-  int ymin = gid.y * pm.strideX - pm.paddingX;
-  int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height()));
-  ymin = max(ymin, 0);
-  
-  half4 r = 0;
-  if (pm.poolType == 0) {
-    r = inTexture.read(uint2(xmin, ymin), gid.z);
-    for (int x = xmin; x < xmax; x++) {
-      for (int y = ymin; y < ymax; y++) {
-        r = fmax(r, inTexture.read(uint2(x, y), gid.z));
-      }
-    }
-  } else if (pm.poolType == 1) {
-    for (int x = xmin; x < xmax; x++) {
-      for (int y = ymin; y < ymax; y++) {
-        r += inTexture.read(uint2(x, y), gid.z);
-      }
-    }
-    r /= pm.ksizeX * pm.ksizeY;
-  }
-  outTexture.write(r, gid.xy, gid.z);
-}
 
-struct TransposeParam {
-  int iC;
-  int oC;
-  int axis[4];
-};
 
-kernel void transpose(texture2d_array<float, access::read> inTexture [[texture(0)]],
-                      texture2d_array<float, access::write> outTexture [[texture(1)]],
-                      constant TransposeParam &pm [[buffer(0)]],
-                      uint3 gid [[thread_position_in_grid]]) {
-  
 
-  if ((pm.axis[0] == 0) && (pm.axis[1] == 1) && (pm.axis[2] == 2) && (pm.axis[3] == 3)) {
-    // do nothing
-    float4 r = inTexture.read(gid.xy, gid.z);
-    outTexture.write(r, gid.xy, gid.z);
-  } else {
-    float4 r;
-    for (int n = 0; n < 4; n++) {
-      int ixyzn[] = {int(gid.x), int(gid.y), int(gid.z), n};
-      int iabcd[4], oabcd[4], oxyzn[4];
-      xyzn2abcd(pm.oC, ixyzn, iabcd);
-      oabcd[pm.axis[0]] = iabcd[0];
-      oabcd[pm.axis[1]] = iabcd[1];
-      oabcd[pm.axis[2]] = iabcd[2];
-      oabcd[pm.axis[3]] = iabcd[3];
-      abcd2xyzn(pm.iC, oabcd, oxyzn);
-      float4 rt = inTexture.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2]);
-      r[n] = rt[oxyzn[3]];
-    }
-    outTexture.write(r, gid.xy, gid.z);
-  }
-}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PoolKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PoolKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..1f2f7240db2ba716090001ed539bddb87dff5117
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PoolKernel.metal
@@ -0,0 +1,93 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+
+struct PoolParam {
+  int ksizeX;
+  int ksizeY;
+  int strideX;
+  int strideY;
+  int paddingX;
+  int paddingY;
+  int poolType;
+};
+
+// Pools the input window of each output texel: poolType 0 takes the maximum, poolType 1 the average.
+kernel void pool(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                 texture2d_array<float, access::write> outTexture [[texture(1)]],
+                 constant PoolParam &pm [[buffer(0)]],
+                 uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+  int xmin = gid.x * pm.strideX - pm.paddingX;  // may be negative because of padding
+  int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
+  xmin = max(xmin, 0);
+  // Rows use the Y stride, padding and kernel size so non-square windows are pooled correctly.
+  int ymin = gid.y * pm.strideY - pm.paddingY;
+  int ymax = min(ymin + pm.ksizeY, int(inTexture.get_height()));
+  ymin = max(ymin, 0);
+  float4 r = 0;
+  if (pm.poolType == 0) {
+    r = inTexture.read(uint2(xmin, ymin), gid.z);  // seed with the first texel of the window
+    for (int x = xmin; x < xmax; x++) {
+      for (int y = ymin; y < ymax; y++) {
+        r = fmax(r, inTexture.read(uint2(x, y), gid.z));
+      }
+    }
+  } else if (pm.poolType == 1) {
+    for (int x = xmin; x < xmax; x++) {
+      for (int y = ymin; y < ymax; y++) {
+        r += inTexture.read(uint2(x, y), gid.z);
+      }
+    }
+    r /= pm.ksizeX * pm.ksizeY;  // divisor is the full window size even when the window is clipped
+  }
+  outTexture.write(r, gid.xy, gid.z);
+}
+
+// Half-precision pooling of each output texel's window: poolType 0 takes the maximum, poolType 1 the average.
+kernel void pool_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                      texture2d_array<half, access::write> outTexture [[texture(1)]],
+                      constant PoolParam &pm [[buffer(0)]],
+                      uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+  int xmin = gid.x * pm.strideX - pm.paddingX;  // may be negative because of padding
+  int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
+  xmin = max(xmin, 0);
+  // Rows use the Y stride, padding and kernel size so non-square windows are pooled correctly.
+  int ymin = gid.y * pm.strideY - pm.paddingY;
+  int ymax = min(ymin + pm.ksizeY, int(inTexture.get_height()));
+  ymin = max(ymin, 0);
+  half4 r = 0;
+  if (pm.poolType == 0) {
+    r = inTexture.read(uint2(xmin, ymin), gid.z);  // seed with the first texel of the window
+    for (int x = xmin; x < xmax; x++) {
+      for (int y = ymin; y < ymax; y++) {
+        r = fmax(r, inTexture.read(uint2(x, y), gid.z));
+      }
+    }
+  } else if (pm.poolType == 1) {
+    for (int x = xmin; x < xmax; x++) {
+      for (int y = ymin; y < ymax; y++) {
+        r += inTexture.read(uint2(x, y), gid.z);
+      }
+    }
+    r /= pm.ksizeX * pm.ksizeY;  // divisor is the full window size even when the window is clipped
+  }
+  outTexture.write(r, gid.xy, gid.z);
+}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal
index 1c5b08ee7eeaaa4fd2a8b5064a6af66c77596120..bd14a146100d0a0723f73ab5fd1f95d1f8e39c97 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal
@@ -15,8 +15,6 @@
 #include <metal_stdlib>
 using namespace metal;
 
-
-
 kernel void prelu_channel(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                            texture2d_array<float, access::write> outTexture [[texture(1)]],
                            const device float4 *alpha [[buffer(0)]],
@@ -82,3 +80,4 @@ kernel void prelu_other(texture2d_array<float, access::sample> inTexture [[textu
   output.w = input.w > 0 ? input.w : (alpha_value * input.w);
   outTexture.write(output, gid.xy, gid.z);
 }
+
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal
index 6083c6b514a3d8a0918d585a950d915e69a045fe..4107f0e30831bdb4c7ac3c020c0dc8e3a52a94cb 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal
@@ -35,8 +35,8 @@ struct PriorBoxMetalParam {
 kernel void prior_box(texture2d_array<float, access::read> inTexture [[texture(0)]],
                       texture2d_array<float, access::write> outBoxTexture [[texture(1)]],
                       texture2d_array<float, access::write> varianceTexture [[texture(2)]],
-                      constant PriorBoxMetalParam &param [[buffer(0)]],
-                      const device float *aspect_ratios [[buffer(1)]],
+                      const device float *aspect_ratios [[buffer(0)]],
+                      constant PriorBoxMetalParam &param [[buffer(1)]],
                       const device float4 *variances [[buffer(2)]],
                       uint3 gid [[thread_position_in_grid]]) {
   if (gid.x >= outBoxTexture.get_width() ||
@@ -96,3 +96,68 @@ kernel void prior_box(texture2d_array<float, access::read> inTexture [[texture(0
   }
 }
 
+
+// Half-precision prior-box kernel: each thread writes one box (corners divided by the image size) and one variance vector.
+kernel void prior_box_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                      texture2d_array<half, access::write> outBoxTexture [[texture(1)]],
+                      texture2d_array<half, access::write> varianceTexture [[texture(2)]],
+                      constant PriorBoxMetalParam &param [[buffer(0)]],
+                      const device half *aspect_ratios [[buffer(1)]],
+                      const device float4 *variances [[buffer(2)]],
+                      uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outBoxTexture.get_width() || gid.y >= outBoxTexture.get_height() ||
+      gid.z >= outBoxTexture.get_array_size()) return;
+  // NOTE(review): prior_box binds aspect_ratios at buffer(0) and param at buffer(1); this kernel uses the opposite order -- confirm against the host encoder.
+  float center_x = (gid.x + param.offset) * param.stepWidth;
+  float center_y = (gid.y + param.offset) * param.stepHeight;
+
+  float box_width, box_height;
+  // Slices below aspecRatiosSize hold the boxes derived from minSize and the slice's aspect ratio.
+  if (gid.z < param.aspecRatiosSize) {
+    half ar = aspect_ratios[gid.z];
+    box_width = param.minSize * sqrt(ar) / 2;
+    box_height = param.minSize / sqrt(ar) / 2;
+    float4 box;
+    box.x = (center_x - box_width) / param.imageWidth;
+    box.y = (center_y - box_height) / param.imageHeight;
+    box.z = (center_x + box_width) / param.imageWidth;
+    box.w = (center_y + box_height) / param.imageHeight;
+
+    float4 res;
+    if (param.clip) {
+      res = fmin(fmax(box, 0.0), 1.0);
+    } else {
+      res = box;
+    }
+
+    outBoxTexture.write(half4(res), gid.xy, gid.z);
+  } else if (gid.z >= param.aspecRatiosSize) {
+    if (param.maxSizeSize > 0) {  // remaining slices get the square sqrt(minSize * maxSize) box
+      box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
+      float4 max_box;
+      max_box.x = (center_x - box_width) / param.imageWidth;
+      max_box.y = (center_y - box_height) / param.imageHeight;
+      max_box.z = (center_x + box_width) / param.imageWidth;
+      max_box.w = (center_y + box_height) / param.imageHeight;
+      // Clamp to [0, 1] when clipping is enabled and store the clamped box, matching the branch above.
+      float4 res;
+      if (param.clip) {
+        res = min(max(max_box, 0.0), 1.0);
+      } else {
+        res = max_box;
+      }
+      outBoxTexture.write(half4(res), gid.xy, gid.z);
+    }
+  }
+  // Every slice below numPriors receives the same variance vector, variances[0].
+  float4 variance = variances[0];
+  if (gid.z < param.numPriors) {
+    float4 variances_output;
+    variances_output.x = variance.x;
+    variances_output.y = variance.y;
+    variances_output.z = variance.z;
+    variances_output.w = variance.w;
+    varianceTexture.write(half4(variances_output), gid.xy, gid.z);
+  }
+}
+
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.metal
index 533f30156d5c1820bb14051d587e9c576c85ea72..399287da71feb11b4e19167ced4f7fe4acdbf42a 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.metal
@@ -102,14 +102,36 @@ kernel void reshape(texture2d_array<float, access::read> inTexture [[texture(0)]
   outTexture.write(r, gid.xy, gid.z);
 }
 
-kernel void reshape_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
-                         texture2d_array<half, access::write> outTexture [[texture(1)]],
-                         uint3 gid [[thread_position_in_grid]]) {
-    if (gid.x >= outTexture.get_width() ||
-        gid.y >= outTexture.get_height() ||
-        gid.z >= outTexture.get_array_size()) return;
 
-    half4 r = inTexture.read(uint2(0, 0), gid.x);
-    outTexture.write(r, gid.xy, gid.z);
+// Half-precision reshape: each lane of the output texel is mapped through abcd2index to an index and gathered from the input; lanes with index >= count are zero.
+kernel void reshape_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                    texture2d_array<half, access::write> outTexture [[texture(1)]],
+                    constant ReshapeParam &rp [[buffer(0)]],
+                    uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+  // Thread-local copy of the parameters; its arrays are what the index helpers receive.
+  ReshapeParam lrp = rp;
+  int outXYZN[4] = {int(gid.x), int(gid.y), int(gid.z), 0};
+  int outABCD[4], inABCD[4], inXYZN[4];
+  const int iC = lrp.idim[lrp.itrans[3]], oC = lrp.odim[lrp.otrans[3]];
+  const int count = lrp.odim[0] * lrp.odim[1] * lrp.odim[2] * lrp.odim[3];
+  half4 r;
+  for (int n = 0; n < 4; n++) {
+    outXYZN[3] = n;
+    xyzn2abcd(oC, outXYZN, outABCD);
+    int tabcd[4];
+    invtrans(lrp.otrans, outABCD, tabcd);
+    const int index = abcd2index(lrp.odim, tabcd);
+    if (index >= count) {
+      r[n] = 0;
+      continue;
+    }
+    index2abcd(lrp.idim, index, tabcd);
+    trans(lrp.itrans, tabcd, inABCD);
+    abcd2xyzn(iC, inABCD, inXYZN);
+    r[n] = inTexture.read(uint2(inXYZN[0], inXYZN[1]), inXYZN[2])[inXYZN[3]];
+  }
+  outTexture.write(r, gid.xy, gid.z);
 }
 
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.metal
index ce70c9e6652f5e0be73bebba2f55877837b0b4a7..3442ba17ceee08d68b1f84642e00c56c5f73a4a2 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.metal
@@ -57,25 +57,44 @@ kernel void softmax(texture2d_array<float, access::read> inTexture [[texture(0)]
   rr = exp(rr - maxv) / sum;
   outTexture.write(rr, gid.xy, gid.z);
 }
-//
-//kernel void softmax_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
-//                         texture2d_array<half, access::write> outTexture [[texture(1)]],
-//                         uint3 gid [[thread_position_in_grid]]) {
-//  if (gid.x >= outTexture.get_width() ||
-//      gid.y >= outTexture.get_height() ||
-//      gid.z >= outTexture.get_array_size()) return;
-//  int zsize = inTexture.get_array_size();
-//  half maxv = inTexture.read(uint2(0, 0), 0)[0];
-//  for (int z = 0; z < zsize; z++) {
-//    half4 r = inTexture.read(uint2(0, 0), z);
-//    maxv = max(maxv, max(max(r[0], r[1]), max(r[2], r[3])));
-//  }
-//  float sum = 0;
-//  for (int z = 0; z < zsize; z++) {
-//    half4 r = inTexture.read(uint2(0, 0), z);
-//    sum += exp(r[0] - maxv) + exp(r[1] - maxv) + exp(r[2] - maxv) + exp(r[3] - maxv);
-//  }
-//  half4 rr = inTexture.read(gid.xy, gid.z);
-//  rr = exp(rr - maxv) / sum;
-//  outTexture.write(rr, gid.xy, gid.z);
-//}
+
+// Half-precision softmax over the first sp.K channels stored at this thread's (x, y),
+// four channels per array slice. The maximum is tracked in half; the exponentials and
+// their sum are evaluated in float. Each thread normalizes the four lanes of slice gid.z.
+// Lanes past channel K in the last slice are still written, using the same peak and total.
+kernel void softmax_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                    texture2d_array<half, access::write> outTexture [[texture(1)]],
+                    constant SoftmaxParam &sp [[buffer(0)]],
+                    uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+  const int fullSlices = sp.K / 4;  // slices whose four lanes all hold channels
+  const int tailLanes = sp.K % 4;   // channels left over in slice number fullSlices
+
+  // Pass 1: maximum over the K channels, seeded with channel 0.
+  half peak = inTexture.read(gid.xy, 0)[0];
+  for (int s = 0; s < fullSlices; s++) {
+    half4 v = inTexture.read(gid.xy, s);
+    peak = max(peak, max(v[0], max(v[1], max(v[2], v[3]))));
+  }
+  for (int lane = 0; lane < tailLanes; lane++) {
+    peak = max(peak, inTexture.read(gid.xy, fullSlices)[lane]);
+  }
+
+  // Pass 2: sum of exp(v - peak), lane-wise over the full slices, then the tail lanes.
+  float4 laneSums = {0, 0, 0, 0};
+  for (int s = 0; s < fullSlices; s++) {
+    half4 v = inTexture.read(gid.xy, s);
+    laneSums += exp(float4(v) - float4(peak));
+  }
+  float total = laneSums[0] + laneSums[1] + laneSums[2] + laneSums[3];
+  for (int lane = 0; lane < tailLanes; lane++) {
+    total += exp(float(inTexture.read(gid.xy, fullSlices)[lane]) - float(peak));
+  }
+
+  // Normalize this thread's own texel and store it back as half.
+  half4 rr = inTexture.read(gid.xy, gid.z);
+  const float4 shifted = float4(rr) - float(peak);
+  outTexture.write(half4(exp(shifted) / total), gid.xy, gid.z);
+}
+
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..4801f0315496ea65ff3ff51174e4e8086909aae8
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.metal
@@ -0,0 +1,80 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+
+struct TransposeParam {
+  int iC;
+  int oC;
+  int axis[4];
+};
+
+// Permutes a texture array's four axes according to pm.axis, one output texel per thread.
+kernel void transpose(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                      texture2d_array<float, access::write> outTexture [[texture(1)]],
+                      constant TransposeParam &pm [[buffer(0)]],
+                      uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;  // skip threads outside the output
+  if ((pm.axis[0] == 0) && (pm.axis[1] == 1) && (pm.axis[2] == 2) && (pm.axis[3] == 3)) {
+    // Identity permutation: copy the texel through unchanged.
+    outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z);
+  } else {
+    float4 r;
+    for (int n = 0; n < 4; n++) {  // one iteration per lane of the output texel
+      int ixyzn[] = {int(gid.x), int(gid.y), int(gid.z), n};
+      int iabcd[4], oabcd[4], oxyzn[4];
+      xyzn2abcd(pm.oC, ixyzn, iabcd);
+      oabcd[pm.axis[0]] = iabcd[0];  // route each decoded index to the axis named by pm.axis
+      oabcd[pm.axis[1]] = iabcd[1];
+      oabcd[pm.axis[2]] = iabcd[2];
+      oabcd[pm.axis[3]] = iabcd[3];
+      abcd2xyzn(pm.iC, oabcd, oxyzn);
+      float4 rt = inTexture.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2]);
+      r[n] = rt[oxyzn[3]];  // keep only the lane that abcd2xyzn selected
+    }
+    outTexture.write(r, gid.xy, gid.z);
+  }
+}
+
+// Half-precision axis permutation of a texture array according to pm.axis, one output texel per thread.
+kernel void transpose_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                           texture2d_array<half, access::write> outTexture [[texture(1)]],
+                           constant TransposeParam &pm [[buffer(0)]],
+                           uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;  // skip threads outside the output
+  if ((pm.axis[0] == 0) && (pm.axis[1] == 1) && (pm.axis[2] == 2) && (pm.axis[3] == 3)) {
+    // Identity permutation: copy the texel through unchanged.
+    outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z);
+  } else {
+    half4 r;
+    for (int n = 0; n < 4; n++) {  // one iteration per lane of the output texel
+      int ixyzn[] = {int(gid.x), int(gid.y), int(gid.z), n};
+      int iabcd[4], oabcd[4], oxyzn[4];
+      xyzn2abcd(pm.oC, ixyzn, iabcd);
+      oabcd[pm.axis[0]] = iabcd[0];  // route each decoded index to the axis named by pm.axis
+      oabcd[pm.axis[1]] = iabcd[1];
+      oabcd[pm.axis[2]] = iabcd[2];
+      oabcd[pm.axis[3]] = iabcd[3];
+      abcd2xyzn(pm.iC, oabcd, oxyzn);
+      half4 rt = inTexture.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2]);
+      r[n] = rt[oxyzn[3]];  // keep only the lane that abcd2xyzn selected
+    }
+    outTexture.write(r, gid.xy, gid.z);
+  }
+}
+
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift
index 6f42f2aa9f8d0515946ace625ed16c5040fd3099..d3d31cfcd43a81f6068238d45f3442af0fc7795f 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift
@@ -60,7 +60,7 @@ class PoolOp<P: PrecisionType>: Operator<PoolKernel<P>, PoolParam<P>>, Runable,
   
   func delogOutput() {
     print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray())
 
     
 //    print("pool2d delog")
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift
index 10b5816d7b4528572cdc6b84d53b73499dde93b4..c7e049e3c1b21d9747acca8812abfff8c25d6d98 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift
@@ -51,13 +51,13 @@ class PreluOp<P: PrecisionType>: Operator<PreluKernel<P>, PreluParam<P>>, Runabl
   
   func delogOutput() {
     print(" \(type) input: ")
-    print(para.input.metalTexture.toTensor(dim: (n: para.input.originDim[0], c: para.input.originDim[1], h: para.input.originDim[2], w: para.input.originDim[3])).strideArray())
+    print(para.input.metalTexture.toTensor(dim: (n: para.input.originDim[0], c: para.input.originDim[1], h: para.input.originDim[2], w: para.input.originDim[3]), texturePrecision: computePrecision).strideArray())
     
     print(" \(type) Alpha: ")
     let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false)
     
     print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray())
+    print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray())
   }
   
 //    print("softmax delog")
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
index 7e82fdec37fb7bd66181fde3af01aedbaf87a023..6999043ab75532e6517398c5bbfe6b893b49cf57 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
@@ -39,7 +39,7 @@ class PriorBoxParam<P: PrecisionType>: OpParam {
   let minSizes: [Float32]
   let maxSizes: [Float32]
   let aspectRatios: [Float32]
-  var newAspectRatios: [Float32]?
+  var newAspectRatios: MTLBuffer?
   let variances: [Float32]
   let flip: Bool
   let clip: Bool
@@ -69,15 +69,19 @@ class PriorBoxOp<P: PrecisionType>: Operator<PriorBoxKernel<P>, PriorBoxParam<P>
   }
   
   func delogOutput() {
-    print(" \(type) output: ")
 
     // output
-    let outputArray = para.output.metalTexture.floatArray { (o: Float32) -> Float32 in
-      return o
+    print(" \(type) output: ")
+    let originDim = para.output.originDim
+    if para.output.transpose == [0, 1, 2, 3] {
+      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
+      print(outputArray.strideArray())
+    } else if para.output.transpose == [0, 2, 3, 1] {
+      print(para.output.metalTexture.toTensor(dim: (n: originDim[0], c: originDim[1], h: originDim[2], w: originDim[3]), texturePrecision: computePrecision).strideArray())
+    } else {
+      print(" not implement")
     }
     
-    print(outputArray)
-    
 //    writeToLibrary(fileName: "box_out", array: outputArray)
     
     // output variance
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift
index c9f054c88af44ac3f5dd453b4696c7988d01fa8f..0325f860e078cf639c08e279970a105e3f562a32 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift
@@ -46,7 +46,7 @@ class ReluOp<P: PrecisionType>: Operator<ReluKernel<P>, ReluParam<P>>, Runable,
   
   func delogOutput() {
     print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray())
   }
   
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift
index 373c448b16d3a597f28884ee2e70b29c152f5526..451b064ce19e0e1cb70700d046b6ab059e6df9e3 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift
@@ -76,7 +76,7 @@ class ReshapeOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>,
     
     let originDim = para.output.originDim
     
-    let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
+    let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
     print(outputArray.strideArray())
 
   }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift
index 0eddca3eb9ce21982954c28e5484f94eb5cacbe1..af776450d7f15d031a6af546d13bc1c0374249b7 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift
@@ -52,9 +52,9 @@ class SoftmaxOp<P: PrecisionType>: Operator<SoftmaxKernel<P>, SoftmaxParam<P>>,
   
   func delogOutput() {
     print("softmax delog")
-    
+
     let originDim = para.output.originDim
-    let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
+    let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
     print(outputArray.strideArray())
   }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift
index 3f05a3db3e0de7abaa208ba8b0700688fe349d4e..dc658793a34f96fb0b1fa88b122a1c8639d50058 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift
@@ -49,7 +49,7 @@ class TransposeOp<P: PrecisionType>: Operator<TransposeKernel<P>, TransposeParam
   func delogOutput() {
     print(" \(type) output: ")
     let originDim = para.output.tensorDim
-    let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
+    let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision)
     print(outputArray.strideArray())
   }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
index e7953e83490e9f3d2bca8b035193dcb0406feaa7..a318180b2b57b162715f0088fdfd88767506ad2a 100644
--- a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
@@ -12,7 +12,6 @@
  See the License for the specific language governing permissions and
  limitations under the License. */
 
-import Accelerate
 import Foundation
 
 protocol Tensorial: CustomStringConvertible, CustomDebugStringConvertible{
@@ -27,10 +26,11 @@ extension Tensorial {
   }
 }
 
+public enum ComputePrecision {
+  case Float32, Float16
+}
+
 class Tensor<P: PrecisionType>: Tensorial {
-  enum BufferPrecision {
-    case Float32, Float16
-  }
   
   var data: Data
   var dim: Dim
@@ -93,15 +93,9 @@ class Tensor<P: PrecisionType>: Tensorial {
     layout = to
   }
   
-  func float32ToFloat16(input: UnsafeMutablePointer<Float32>, output: UnsafeMutableRawPointer, count: Int) {
-    var float32Buffer = vImage_Buffer(data: input,  height: 1, width: UInt(count), rowBytes: count * 4)
-    var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2)
-    guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else {
-      fatalError(" float 32 to float 16 error ! ")
-    }
-  }
+
   
-  func initBuffer(device: MTLDevice, precision: BufferPrecision = .Float32) {
+  func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16) {
     guard let floatPointer = data.pointer as? UnsafeMutablePointer<Float32> else {
       fatalError(" not support yet ")
     }
diff --git a/metal/paddle-mobile/paddle-mobile/framework/Texture.swift b/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
index 9889171dba3cb8f18b20cac597a6fec505a92e73..42a381c50cd3a66f39d9e19e9cffc8de83ca48b2 100644
--- a/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
@@ -46,7 +46,7 @@ public class Texture<P: PrecisionType>: Tensorial {
   public var metalTexture: MTLTexture!
   var transpose: [Int] = [0, 1, 2, 3]
   
-  func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3]) {
+  func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) {
     transpose = inTranspose
     let newDim = transpose.map { originDim[$0] }
     
@@ -64,12 +64,10 @@ public class Texture<P: PrecisionType>: Tensorial {
     tmpTextureDes.depth = 1
     tmpTextureDes.arrayLength = ((newDim[0]) * (newDim[3]) + 3) / 4
     tmpTextureDes.textureType = .type2DArray
-    
-    if MemoryLayout<P>.size == 1 {
-      tmpTextureDes.pixelFormat = .rgba8Unorm
-    } else if MemoryLayout<P>.size == 2 {
+   
+    if computePrecision == .Float16 {
       tmpTextureDes.pixelFormat = .rgba16Float
-    } else if MemoryLayout<P>.size == 4 {
+    } else if computePrecision == .Float32 {
       tmpTextureDes.pixelFormat = .rgba32Float
     }