From 005115a1957690e935f3b14c19ac1035ca28219e Mon Sep 17 00:00:00 2001
From: liuruilong <liuruilong@baidu.com>
Date: Tue, 5 Mar 2019 21:03:47 +0800
Subject: [PATCH] format files, improve accuracy

---
 .../MobileNetDemo/AppDelegate.swift           |   64 +-
 .../MobileNetDemo/MobileNet.swift             |   82 +-
 .../MobileNetDemo/MobilenetPreProcess.metal   |   28 +-
 .../MobileNetDemo/ViewController.swift        |  132 +-
 .../paddle-mobile-demo/AppDelegate.swift      |   18 +-
 .../Base.lproj/Main.storyboard                |    2 +-
 .../paddle-mobile-demo/MetalHelper.swift      |   20 +-
 .../MultiPredictViewController.swift          |   92 +-
 .../paddle-mobile-demo/Net/CPUCompute.mm      |  434 +++----
 .../paddle-mobile-demo/Net/Genet.swift        |   64 +-
 .../paddle-mobile-demo/Net/MobileNet.swift    |   86 +-
 .../Net/MobileNetCombined.swift               |   28 +-
 .../paddle-mobile-demo/Net/MobileNetSSD.swift |  158 +--
 .../Net/MobilenetSSD_AR.swift                 |  256 ++--
 .../Net/PreProcessKernel.metal                |   80 +-
 .../paddle-mobile-demo/Net/YoloNet.swift      |   28 +-
 .../OCDemo/LoadPointerViewController.m        |  136 +-
 .../OCInterface/PaddleMobileGPU.h             |    4 +-
 .../OCInterface/PaddleMobileGPU.m             |   84 +-
 .../OCInterface/SuperResolutionNet.swift      |   96 +-
 .../VideoCapture/FPSCounter.swift             |   42 +-
 .../VideoCapture/VideoCapture.swift           |  112 +-
 .../paddle-mobile-demo/ViewController.swift   |  411 +++---
 .../project.pbxproj                           |    6 +-
 .../BatchNormKernel.metal                     |   32 +-
 .../BatchNormRelu.metal                       |   10 +-
 .../BilinearInterp.inc.metal                  |   46 +-
 .../BilinearInterp.metal                      |    4 +-
 .../paddle-mobile-metallib/BoxCoder.inc.metal |   58 +-
 .../BufferToTexture.metal                     |   28 +-
 .../paddle-mobile-metallib/Common.metal       |  124 +-
 .../ConcatKernel.inc.metal                    |  328 ++---
 .../paddle-mobile-metallib/ConcatKernel.metal |  190 +--
 .../ConvAddBNReluKernel.metal                 |  476 +++----
 .../paddle-mobile-metallib/ConvAddMetal.metal | 1040 +++++++--------
 .../ConvAddPrelu.inc.metal                    |  692 +++++-----
 .../ConvAddPreluKernel.metal                  |   60 +-
 .../ConvBNReluKernel.metal                    |  464 +++----
 .../paddle-mobile-metallib/ConvKernel.metal   |  440 +++----
 .../ConvTransposeKernel.metal                 |  158 +--
 .../paddle-mobile-metallib/Elementwise.metal  |  134 +-
 .../ElementwiseAddPreluKernel.inc.metal       |  104 +-
 .../ElementwiseAddPreluKernel.metal           |   14 +-
 .../FetchKernel.inc.metal                     |   58 +-
 .../paddle-mobile-metallib/FetchKernel.metal  |    2 +-
 .../paddle-mobile-metallib/Kernels.metal      |   52 +-
 .../NMSFetchResultKernel.metal                |  100 +-
 .../PoolKernel.inc.metal                      |   58 +-
 .../paddle-mobile-metallib/PoolKernel.metal   |   14 +-
 .../paddle-mobile-metallib/PreluKernel.metal  |  222 ++--
 .../PriorBoxKernel.metal                      |  584 ++++-----
 .../paddle-mobile-metallib/ReluKernel.metal   |   36 +-
 .../ReshapeKernel.inc.metal                   |   60 +-
 .../ReshapeKernel.metal                       |    8 +-
 .../ResizeBilinear.metal                      |  100 +-
 .../paddle-mobile-metallib/Scale.metal        |   28 +-
 .../paddle-mobile-metallib/Softmax.inc.metal  |   66 +-
 .../paddle-mobile-metallib/Softmax.metal      |    4 +-
 .../paddle-mobile-metallib/Split.inc.metal    |   98 +-
 .../paddle-mobile-metallib/Split.metal        |   50 +-
 .../TransposeKernel.inc.metal                 |   42 +-
 .../TransposeKernel.metal                     |   58 +-
 .../paddle-mobile-unit-test/AppDelegate.swift |   24 +-
 .../ViewController.swift                      |   12 +-
 .../paddle-mobile.xcodeproj/project.pbxproj   |    4 +-
 .../paddle-mobile/API/GlobalConfig.swift      |   30 +-
 .../paddle-mobile/paddle-mobile/API/Net.swift |  136 +-
 .../paddle-mobile/API/Runner.swift            |  322 ++---
 .../paddle-mobile/Src/Common/Extensions.swift |  156 +--
 .../Src/Common/MetalExtension.swift           | 1122 ++++++++---------
 .../Src/Common/PaddleMobileUnitTest.swift     |  376 +++---
 .../paddle-mobile/Src/Common/Types.swift      |  414 +++---
 .../paddle-mobile/Src/Framework/Dim.swift     |   74 +-
 .../Src/Framework/Executor.swift              |  233 ++--
 .../paddle-mobile/Src/Framework/Loader.swift  |  464 +++----
 .../paddle-mobile/Src/Framework/Tensor.swift  |  586 ++++-----
 .../paddle-mobile/Src/Framework/Texture.swift |  288 ++---
 .../Src/Operators/Base/OpCreator.swift        |    4 +-
 .../Src/Operators/Base/OpParam.swift          |  376 +++---
 .../Src/Operators/Base/Operator.swift         |  180 +--
 .../Src/Operators/BatchNormOp.swift           |   86 +-
 .../Src/Operators/BilinearInterpOp.swift      |   78 +-
 .../Src/Operators/BoxcoderOp.swift            |  116 +-
 .../Src/Operators/ConcatOp.swift              |   98 +-
 .../Src/Operators/ConvAddAddPreluOp.swift     |  166 +--
 .../Operators/ConvAddBatchNormReluOp.swift    |  202 +--
 .../Src/Operators/ConvAddOp.swift             |  180 +--
 .../Src/Operators/ConvAddPreluOp.swift        |  152 +--
 .../Src/Operators/ConvBNReluOp.swift          |  180 +--
 .../paddle-mobile/Src/Operators/ConvOp.swift  |  112 +-
 .../Src/Operators/ConvTransposeOp.swift       |   66 +-
 .../Src/Operators/DepthwiseConvOp.swift       |   68 +-
 .../Src/Operators/DwConvBNReluOp.swift        |   98 +-
 .../Src/Operators/ElementwiseAddOp.swift      |  132 +-
 .../Src/Operators/ElementwiseAddPreluOp.swift |  172 +--
 .../paddle-mobile/Src/Operators/FeedOp.swift  |   86 +-
 .../paddle-mobile/Src/Operators/FetchOp.swift |   66 +-
 .../Src/Operators/FlattenOp.swift             |   66 +-
 .../Src/Operators/Kernels/Base/Kernel.swift   |  184 +--
 .../Operators/Kernels/BatchNormKernel.swift   |   66 +-
 .../Kernels/BilinearInterpKernel.swift        |   62 +-
 .../Operators/Kernels/BoxcoderKernel.swift    |   46 +-
 .../Src/Operators/Kernels/ConcatKernel.swift  |  228 ++--
 .../Kernels/ConvAddAddPreluKernel.swift       |  248 ++--
 .../Kernels/ConvAddBatchNormReluKernel.swift  |  302 ++---
 .../Src/Operators/Kernels/ConvAddKernel.swift |  134 +-
 .../Kernels/ConvAddPreluKernel.swift          |  248 ++--
 .../Operators/Kernels/ConvBNReluKernel.swift  |  302 ++---
 .../Src/Operators/Kernels/ConvKernel.swift    |   74 +-
 .../Kernels/ConvTransposeKernel.swift         |  114 +-
 .../Kernels/ElementwiseAddKernel.swift        |  100 +-
 .../Kernels/ElementwiseAddPreluKernel.swift   |  114 +-
 .../Src/Operators/Kernels/FetchKernel.swift   |   80 +-
 .../Src/Operators/Kernels/FlattenKernel.swift |   94 +-
 .../Kernels/MulticlassNMSKernel.swift         |   66 +-
 .../Src/Operators/Kernels/PoolKernel.swift    |   94 +-
 .../Src/Operators/Kernels/PreluKernel.swift   |   66 +-
 .../Operators/Kernels/PriorBoxKernel.swift    |  250 ++--
 .../Src/Operators/Kernels/ReluKernel.swift    |   34 +-
 .../Src/Operators/Kernels/ReshapeKernel.swift |  140 +-
 .../Kernels/ResizeBilinearKernel.swift        |   54 +-
 .../Src/Operators/Kernels/Scale.swift         |   16 +-
 .../Src/Operators/Kernels/ShapeKernel.swift   |   38 +-
 .../Src/Operators/Kernels/SoftmaxKernel.swift |   56 +-
 .../Src/Operators/Kernels/SplitKernel.swift   |  140 +-
 .../Kernels/Texture2DTo2DArrayKernel.swift    |   44 +-
 .../Operators/Kernels/TransposeKernel.swift   |  110 +-
 .../Src/Operators/MulticlassNMSOp.swift       |   90 +-
 .../paddle-mobile/Src/Operators/PoolOp.swift  |  100 +-
 .../paddle-mobile/Src/Operators/PreluOp.swift |   80 +-
 .../Src/Operators/PriorBoxOp.swift            |  188 +--
 .../paddle-mobile/Src/Operators/ReluOp.swift  |   66 +-
 .../Src/Operators/ReshapeOp.swift             |  106 +-
 .../Src/Operators/ResizeBilinearOp.swift      |   76 +-
 .../paddle-mobile/Src/Operators/ShapeOp.swift |   56 +-
 .../Src/Operators/SoftmaxOp.swift             |   76 +-
 .../paddle-mobile/Src/Operators/SplitOp.swift |  100 +-
 .../Src/Operators/TransposeOp.swift           |   64 +-
 .../Src/Program/PMBlockDesc.swift             |   10 +-
 .../paddle-mobile/Src/Program/PMOpDesc.swift  |   50 +-
 .../paddle-mobile/Src/Program/PMVarDesc.swift |    4 +-
 .../Src/Program/ProgramOptimize.swift         |  468 +++----
 .../paddle-mobile/Src/Program/Scope.swift     |    2 +-
 143 files changed, 10224 insertions(+), 10246 deletions(-)

diff --git a/metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift b/metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift
index 4152b9be89..9596c1a535 100644
--- a/metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift
+++ b/metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift
@@ -10,37 +10,37 @@ import UIKit
 
 @UIApplicationMain
 class AppDelegate: UIResponder, UIApplicationDelegate {
-
-  var window: UIWindow?
-
-
-  func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]?) -> Bool {
-    // Override point for customization after application launch.
-    return true
-  }
-
-  func applicationWillResignActive(_ application: UIApplication) {
-    // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state.
-    // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game.
-  }
-
-  func applicationDidEnterBackground(_ application: UIApplication) {
-    // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later.
-    // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits.
-  }
-
-  func applicationWillEnterForeground(_ application: UIApplication) {
-    // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background.
-  }
-
-  func applicationDidBecomeActive(_ application: UIApplication) {
-    // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface.
-  }
-
-  func applicationWillTerminate(_ application: UIApplication) {
-    // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:.
-  }
-
-
+    
+    var window: UIWindow?
+    
+    
+    func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]?) -> Bool {
+        // Override point for customization after application launch.
+        return true
+    }
+    
+    func applicationWillResignActive(_ application: UIApplication) {
+        // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state.
+        // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game.
+    }
+    
+    func applicationDidEnterBackground(_ application: UIApplication) {
+        // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later.
+        // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits.
+    }
+    
+    func applicationWillEnterForeground(_ application: UIApplication) {
+        // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background.
+    }
+    
+    func applicationDidBecomeActive(_ application: UIApplication) {
+        // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface.
+    }
+    
+    func applicationWillTerminate(_ application: UIApplication) {
+        // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:.
+    }
+    
+    
 }
 
diff --git a/metal/MobileNetDemo/MobileNetDemo/MobileNet.swift b/metal/MobileNetDemo/MobileNetDemo/MobileNet.swift
index f0902855cc..7f26427f2b 100644
--- a/metal/MobileNetDemo/MobileNetDemo/MobileNet.swift
+++ b/metal/MobileNetDemo/MobileNetDemo/MobileNet.swift
@@ -16,51 +16,51 @@ import Foundation
 import paddle_mobile
 
 public class MobileNet: Net{
-  class MobilenetPreProccess: CusomKernel {
-    init(device: MTLDevice) {
-      let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3)
-      super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
+    class MobilenetPreProccess: CusomKernel {
+        init(device: MTLDevice) {
+            let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3)
+            super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
+        }
     }
-  }
-  
-  class PreWords {
-    var contents: [String] = []
-    init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) {
-      if let filePath = inBundle.path(forResource: fileName, ofType: type) {
-        let string = try! String.init(contentsOfFile: filePath)
-        contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{
-          String($0[$0.index($0.startIndex, offsetBy: 10)...])
+    
+    class PreWords {
+        var contents: [String] = []
+        init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) {
+            if let filePath = inBundle.path(forResource: fileName, ofType: type) {
+                let string = try! String.init(contentsOfFile: filePath)
+                contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{
+                    String($0[$0.index($0.startIndex, offsetBy: 10)...])
+                }
+            }else{
+                fatalError("no file call \(fileName)")
+            }
+        }
+        subscript(index: Int) -> String {
+            return contents[index]
         }
-      }else{
-        fatalError("no file call \(fileName)")
-      }
     }
-    subscript(index: Int) -> String {
-      return contents[index]
+    
+    let labels = PreWords.init(fileName: "synset")
+    
+    override public func resultStr(res: [ResultHolder]) -> String {
+        let firstRes = res[0]
+        let resPointer = firstRes.result
+        var s: [String] = []
+        (0..<firstRes.capacity).map { resPointer[$0] }.top(r: 5).enumerated().forEach{
+            s.append(String(format: "%d: %@ (%3.2f%%)", $0 + 1, labels[$1.0], $1.1 * 100))
+        }
+        return s.joined(separator: "\n")
     }
-  }
-  
-  let labels = PreWords.init(fileName: "synset")
-  
-  override public func resultStr(res: [ResultHolder]) -> String {
-    let firstRes = res[0]
-    let resPointer = firstRes.result
-    var s: [String] = []
-    (0..<firstRes.capacity).map { resPointer[$0] }.top(r: 5).enumerated().forEach{
-      s.append(String(format: "%d: %@ (%3.2f%%)", $0 + 1, labels[$1.0], $1.1 * 100))
+    
+    override public init(device: MTLDevice) {
+        super.init(device: device)
+        except = 0
+        modelPath = Bundle.main.path(forResource: "mobilenet_model", ofType: nil) ?! "model null"
+        paramPath = Bundle.main.path(forResource: "mobilenet_params", ofType: nil) ?! "para null"
+        preprocessKernel = MobilenetPreProccess.init(device: device)
+        inputDim = Dim.init(inDim: [1, 224, 224, 3])
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
     }
-    return s.joined(separator: "\n")
-  }
-  
-  override public init(device: MTLDevice) {
-    super.init(device: device)
-    except = 0
-    modelPath = Bundle.main.path(forResource: "mobilenet_model", ofType: nil) ?! "model null"
-    paramPath = Bundle.main.path(forResource: "mobilenet_params", ofType: nil) ?! "para null"
-    preprocessKernel = MobilenetPreProccess.init(device: device)
-    inputDim = Dim.init(inDim: [1, 224, 224, 3])
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-  }
 }
 
diff --git a/metal/MobileNetDemo/MobileNetDemo/MobilenetPreProcess.metal b/metal/MobileNetDemo/MobileNetDemo/MobilenetPreProcess.metal
index c7db4187c1..2da78ec4c1 100644
--- a/metal/MobileNetDemo/MobileNetDemo/MobilenetPreProcess.metal
+++ b/metal/MobileNetDemo/MobileNetDemo/MobilenetPreProcess.metal
@@ -14,13 +14,13 @@ kernel void mobilenet_preprocess(
                                  texture2d<float, access::write> outTexture [[texture(1)]],
                                  uint2 gid [[thread_position_in_grid]])
 {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) {
-    return;
-  }
-  const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f);
-  const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
-  outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f);
+    const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
+    outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
 }
 
 kernel void mobilenet_preprocess_half(
@@ -28,11 +28,11 @@ kernel void mobilenet_preprocess_half(
                                       texture2d<half, access::write> outTexture [[texture(1)]],
                                       uint2 gid [[thread_position_in_grid]])
 {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) {
-    return;
-  }
-  const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f);
-  const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
-  outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f);
+    const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
+    outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
 }
diff --git a/metal/MobileNetDemo/MobileNetDemo/ViewController.swift b/metal/MobileNetDemo/MobileNetDemo/ViewController.swift
index 4e31282f03..a0d69c5c06 100644
--- a/metal/MobileNetDemo/MobileNetDemo/ViewController.swift
+++ b/metal/MobileNetDemo/MobileNetDemo/ViewController.swift
@@ -10,84 +10,84 @@ import UIKit
 import paddle_mobile
 
 class ViewController: UIViewController {
-  @IBOutlet weak var resultTextView: UITextView!
-  @IBOutlet weak var selectImageView: UIImageView!
-  @IBOutlet weak var elapsedTimeLabel: UILabel!
-  var net: MobileNet!
-  var runner: Runner!
-  var toPredictTexture: MTLTexture?
-  
-  override func viewDidLoad() {
-    super.viewDidLoad()
-    GlobalConfig.shared.computePrecision = .Float16
-    net = MobileNet.init(device: MetalHelper.shared.device)
-    runner = Runner.init(inNet: net, commandQueue: MetalHelper.shared.queue)
+    @IBOutlet weak var resultTextView: UITextView!
+    @IBOutlet weak var selectImageView: UIImageView!
+    @IBOutlet weak var elapsedTimeLabel: UILabel!
+    var net: MobileNet!
+    var runner: Runner!
+    var toPredictTexture: MTLTexture?
     
-    if let selectImage = UIImage.init(named: "banana.jpeg") {
-      selectImageView.image = selectImage
-      runner.getTexture(image: selectImage.cgImage!) {[weak self] (texture) in
-        self?.toPredictTexture = texture
-      }
+    override func viewDidLoad() {
+        super.viewDidLoad()
+        GlobalConfig.shared.computePrecision = .Float16
+        net = MobileNet.init(device: MetalHelper.shared.device)
+        runner = Runner.init(inNet: net, commandQueue: MetalHelper.shared.queue)
+        
+        if let selectImage = UIImage.init(named: "banana.jpeg") {
+            selectImageView.image = selectImage
+            runner.getTexture(image: selectImage.cgImage!) {[weak self] (texture) in
+                self?.toPredictTexture = texture
+            }
+        }
+        
+    }
+    
+    @IBAction func loadAct(_ sender: Any) {
+        if runner.load() {
+            let resutText = " load success ! "
+            print(resutText)
+            self.resultTextView.text = resutText
+        } else {
+            fatalError(" load error ")
+        }
+    }
+    
+    @IBAction func selectImageAct(_ sender: Any) {
+        let imagePicker = UIImagePickerController()
+        imagePicker.sourceType = .camera
+        imagePicker.delegate = self
+        self.present(imagePicker, animated: true, completion: nil)
     }
     
-  }
-  
-  @IBAction func loadAct(_ sender: Any) {
-    if runner.load() {
-      let resutText = " load success ! "
-      print(resutText)
-      self.resultTextView.text = resutText
-    } else {
-      fatalError(" load error ")
+    @IBAction func clearAct(_ sender: Any) {
+        runner.clear()
     }
-  }
-  
-  @IBAction func selectImageAct(_ sender: Any) {
-    let imagePicker = UIImagePickerController()
-    imagePicker.sourceType = .camera
-    imagePicker.delegate = self
-    self.present(imagePicker, animated: true, completion: nil)
-  }
-  
-  @IBAction func clearAct(_ sender: Any) {
-    runner.clear()
-  }
-  
-  @IBAction func predictAct(_ sender: Any) {
     
-    if let texture = toPredictTexture {
-      let beginDate = Date.init()
-      runner.predict(texture: texture) { [weak self] (success, resultHolder) in
-        if success, let inResultHolder = resultHolder {
-          let timeUse = Date.init().timeIntervalSince(beginDate)
-          DispatchQueue.main.async {
-            self?.elapsedTimeLabel.text = "\(timeUse * 1000)ms"
-            self?.resultTextView.text = self?.net.resultStr(res: inResultHolder)
-          }
-          
+    @IBAction func predictAct(_ sender: Any) {
+        
+        if let texture = toPredictTexture {
+            let beginDate = Date.init()
+            runner.predict(texture: texture) { [weak self] (success, resultHolder) in
+                if success, let inResultHolder = resultHolder {
+                    let timeUse = Date.init().timeIntervalSince(beginDate)
+                    DispatchQueue.main.async {
+                        self?.elapsedTimeLabel.text = "\(timeUse * 1000)ms"
+                        self?.resultTextView.text = self?.net.resultStr(res: inResultHolder)
+                    }
+                    
+                } else {
+                    print(" predict fail ")
+                }
+            }
         } else {
-          print(" predict fail ")
+            print(" toPredictTexture is nil ")
         }
-      }
-    } else {
-      print(" toPredictTexture is nil ")
+        
     }
     
-  }
-  
 }
 
 extension ViewController:  UIImagePickerControllerDelegate, UINavigationControllerDelegate {
-  func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) {
-    picker.dismiss(animated: true){[weak self] in
-      guard let sSelf = self, let image =  info["UIImagePickerControllerOriginalImage"] as? UIImage else {
-        fatalError("no image")
-      }
-      sSelf.selectImageView.image = image
-      sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in
-        sSelf.toPredictTexture = texture
-      })
+    func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) {
+        picker.dismiss(animated: true){[weak self] in
+            guard let sSelf = self, let image =  info["UIImagePickerControllerOriginalImage"] as? UIImage else {
+                fatalError("no image")
+            }
+            sSelf.selectImageView.image = image
+            sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in
+                sSelf.toPredictTexture = texture
+            })
+        }
     }
-  }
 }
 
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift
index 537fb06ed9..557f5eef35 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift
@@ -16,36 +16,36 @@ import UIKit
 
 @UIApplicationMain
 class AppDelegate: UIResponder, UIApplicationDelegate {
-
+    
     var window: UIWindow?
-
+    
     func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplicationLaunchOptionsKey: Any]?) -> Bool {
         // Override point for customization after application launch.
         return true
     }
-
+    
     func applicationWillResignActive(_ application: UIApplication) {
         // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state.
         // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game.
     }
-
+    
     func applicationDidEnterBackground(_ application: UIApplication) {
         // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later.
         // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits.
     }
-
+    
     func applicationWillEnterForeground(_ application: UIApplication) {
         // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background.
     }
-
+    
     func applicationDidBecomeActive(_ application: UIApplication) {
         // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface.
     }
-
+    
     func applicationWillTerminate(_ application: UIApplication) {
         // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:.
     }
-
-
+    
+    
 }
 
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
index 88445bfdb4..d67403f272 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="14460.31" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="4MS-jc-i6A">
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="14460.31" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
     <device id="retina4_7" orientation="portrait">
         <adaptation id="fullscreen"/>
     </device>
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift
index ca19c166c3..8252258c97 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift
@@ -18,14 +18,14 @@ import Foundation
 import paddle_mobile
 
 @objc public class MetalHelper: NSObject {
-  @objc let device: MTLDevice
-  @objc let queue: MTLCommandQueue
-  @objc let textureLoader: MTKTextureLoader
-  @objc static let shared: MetalHelper = MetalHelper.init()
-  private override init(){
-    device = MTLCreateSystemDefaultDevice()!
-    queue = device.makeCommandQueue()!
-    textureLoader = MTKTextureLoader.init(device: device)
-    super.init()
-  }
+    @objc let device: MTLDevice
+    @objc let queue: MTLCommandQueue
+    @objc let textureLoader: MTKTextureLoader
+    @objc static let shared: MetalHelper = MetalHelper.init()
+    private override init(){
+        device = MTLCreateSystemDefaultDevice()!
+        queue = device.makeCommandQueue()!
+        textureLoader = MTKTextureLoader.init(device: device)
+        super.init()
+    }
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift
index 22fb5723ac..8af436d779 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift
@@ -16,51 +16,51 @@ import UIKit
 import paddle_mobile
 
 class MultiPredictViewController: UIViewController {
-  var runner1: Runner!
-  var runner2: Runner!
-  override func viewDidLoad() {
-    super.viewDidLoad()
-    let mobileNet = MobileNet_ssd_hand.init(device: MetalHelper.shared.device)
-    let genet = Genet.init(device: MetalHelper.shared.device)
-    runner1 = Runner.init(inNet: mobileNet, commandQueue: MetalHelper.shared.queue)
-    let queue2 = MetalHelper.shared.device.makeCommandQueue()
+    var runner1: Runner!
+    var runner2: Runner!
+    override func viewDidLoad() {
+        super.viewDidLoad()
+        let mobileNet = MobileNet_ssd_hand.init(device: MetalHelper.shared.device)
+        let genet = Genet.init(device: MetalHelper.shared.device)
+        runner1 = Runner.init(inNet: mobileNet, commandQueue: MetalHelper.shared.queue)
+        let queue2 = MetalHelper.shared.device.makeCommandQueue()
+        
+        runner2 = Runner.init(inNet: genet, commandQueue: MetalHelper.shared.queue)
+    }
     
-    runner2 = Runner.init(inNet: genet, commandQueue: MetalHelper.shared.queue)
-  }
-
-  @IBAction func predictAct(_ sender: Any) {
-    let success = self.runner2.load()
-//    DispatchQueue.global().async {
-      let image1 = UIImage.init(named: "hand.jpg")
-//      let success = self.runner2.load()
-//      if success {
-//        for i in 0..<10000 {
-//          print(i)
-//          self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in
-//            print("result1: ")
-////            print(res)
-//          })
-//        }
-//      } else {
-//        print("load failed")
-//      }
-//      self.runner1.clear()
-//    }
-//    return
-//    DispatchQueue.global().async {
-////      sleep(1)
-//      let image1 = UIImage.init(named: "banana.jpeg")
-////      if success {
-//        for _ in 0..<10 {
-//          self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in
-//            print("result2: ")
-//            print(res)
-//          })
-//        }
-////      } else {
-////        print("load failed")
-////      }
-////      self.runner2.clear()
-//    }
-  }
+    @IBAction func predictAct(_ sender: Any) {
+        let success = self.runner2.load()
+        //    DispatchQueue.global().async {
+        let image1 = UIImage.init(named: "hand.jpg")
+        //      let success = self.runner2.load()
+        //      if success {
+        //        for i in 0..<10000 {
+        //          print(i)
+        //          self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in
+        //            print("result1: ")
+        ////            print(res)
+        //          })
+        //        }
+        //      } else {
+        //        print("load failed")
+        //      }
+        //      self.runner1.clear()
+        //    }
+        //    return
+        //    DispatchQueue.global().async {
+        ////      sleep(1)
+        //      let image1 = UIImage.init(named: "banana.jpeg")
+        ////      if success {
+        //        for _ in 0..<10 {
+        //          self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in
+        //            print("result2: ")
+        //            print(res)
+        //          })
+        //        }
+        ////      } else {
+        ////        print("load failed")
+        ////      }
+        ////      self.runner2.clear()
+        //    }
+    }
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.mm b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.mm
index fac8af2527..ddfc5f770d 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.mm
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.mm
@@ -20,30 +20,30 @@
 #import <algorithm>
 
 struct NMSParam {
-  
-  float *score_data;
-  
-  float *box_data;
-  
-  float *output;
-  
-  int output_size;
-  
-  std::vector<int> score_dim;
-  
-  std::vector<int> box_dim;
-  
-  float scoreThredshold;
-  
-  int nmsTopK;
-  
-  int keepTopK;
-  
-  float nmsEta;
-  
-  float nmsThreshold;
-  
-  int background_label;
+    
+    float *score_data;
+    
+    float *box_data;
+    
+    float *output;
+    
+    int output_size;
+    
+    std::vector<int> score_dim;
+    
+    std::vector<int> box_dim;
+    
+    float scoreThredshold;
+    
+    int nmsTopK;
+    
+    int keepTopK;
+    
+    float nmsEta;
+    
+    float nmsThreshold;
+    
+    int background_label;
 };
 
 
@@ -53,63 +53,63 @@ constexpr int kBBoxSize = 4;
 template <class T>
 bool SortScorePairDescend(const std::pair<float, T>& pair1,
                           const std::pair<float, T>& pair2) {
-  return pair1.first > pair2.first;
+    return pair1.first > pair2.first;
 }
 
 template <class T>
 static inline void GetMaxScoreIndex(
                                     const std::vector<T>& scores, const T threshold, int top_k,
                                     std::vector<std::pair<T, int>>* sorted_indices) {
-  for (size_t i = 0; i < scores.size(); ++i) {
-    if (scores[i] > threshold) {
-      sorted_indices->push_back(std::make_pair(scores[i], i));
+    for (size_t i = 0; i < scores.size(); ++i) {
+        if (scores[i] > threshold) {
+            sorted_indices->push_back(std::make_pair(scores[i], i));
+        }
+    }
+    // Sort the score pair according to the scores in descending order
+    std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
+                     SortScorePairDescend<int>);
+    // Keep top_k scores if needed.
+    if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
+        sorted_indices->resize(top_k);
     }
-  }
-  // Sort the score pair according to the scores in descending order
-  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
-                   SortScorePairDescend<int>);
-  // Keep top_k scores if needed.
-  if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
-    sorted_indices->resize(top_k);
-  }
 }
 
 template <class T>
 static inline T BBoxArea(const T* box, const bool normalized) {
-  if (box[2] < box[0] || box[3] < box[1]) {
-    // If coordinate values are is invalid
-    // (e.g. xmax < xmin or ymax < ymin), return 0.
-    return static_cast<T>(0.);
-  } else {
-    const T w = box[2] - box[0];
-    const T h = box[3] - box[1];
-    if (normalized) {
-      return w * h;
+    if (box[2] < box[0] || box[3] < box[1]) {
+        // If coordinate values are invalid
+        // (e.g. xmax < xmin or ymax < ymin), return 0.
+        return static_cast<T>(0.);
     } else {
-      // If coordinate values are not within range [0, 1].
-      return (w + 1) * (h + 1);
+        const T w = box[2] - box[0];
+        const T h = box[3] - box[1];
+        if (normalized) {
+            return w * h;
+        } else {
+            // If coordinate values are not within range [0, 1].
+            return (w + 1) * (h + 1);
+        }
     }
-  }
 }
 
 template <class T>
 static inline T JaccardOverlap(const T* box1, const T* box2,
                                const bool normalized) {
-  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
-      box2[3] < box1[1]) {
-    return static_cast<T>(0.);
-  } else {
-    const T inter_xmin = std::max(box1[0], box2[0]);
-    const T inter_ymin = std::max(box1[1], box2[1]);
-    const T inter_xmax = std::min(box1[2], box2[2]);
-    const T inter_ymax = std::min(box1[3], box2[3]);
-    const T inter_w = inter_xmax - inter_xmin;
-    const T inter_h = inter_ymax - inter_ymin;
-    const T inter_area = inter_w * inter_h;
-    const T bbox1_area = BBoxArea<T>(box1, normalized);
-    const T bbox2_area = BBoxArea<T>(box2, normalized);
-    return inter_area / (bbox1_area + bbox2_area - inter_area);
-  }
+    if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
+        box2[3] < box1[1]) {
+        return static_cast<T>(0.);
+    } else {
+        const T inter_xmin = std::max(box1[0], box2[0]);
+        const T inter_ymin = std::max(box1[1], box2[1]);
+        const T inter_xmax = std::min(box1[2], box2[2]);
+        const T inter_ymax = std::min(box1[3], box2[3]);
+        const T inter_w = inter_xmax - inter_xmin;
+        const T inter_h = inter_ymax - inter_ymin;
+        const T inter_area = inter_w * inter_h;
+        const T bbox1_area = BBoxArea<T>(box1, normalized);
+        const T bbox2_area = BBoxArea<T>(box2, normalized);
+        return inter_area / (bbox1_area + bbox2_area - inter_area);
+    }
 }
 
 template <typename T>
@@ -120,40 +120,40 @@ static inline void NMSFast(
                            const T score_threshold, const T nms_threshold,
                            const T eta, const int top_k,
                            std::vector<int>* selected_indices) {
-  // The total boxes for each instance.
-  int num_boxes = bbox_dim[0];
-  // 4: [xmin ymin xmax ymax]
-  int box_size = bbox_dim[1];
-
-  std::vector<T> scores_data(num_boxes);
-  std::copy_n(score_data, num_boxes, scores_data.begin());
-  std::vector<std::pair<T, int>> sorted_indices;
-  GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);
-
-  selected_indices->clear();
-  T adaptive_threshold = nms_threshold;
-
-  while (sorted_indices.size() != 0) {
-    const int idx = sorted_indices.front().second;
-    bool keep = true;
-    for (size_t k = 0; k < selected_indices->size(); ++k) {
-      if (keep) {
-        const int kept_idx = (*selected_indices)[k];
-        T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
-                                      bbox_data + kept_idx * box_size, true);
-        keep = overlap <= adaptive_threshold;
-      } else {
-        break;
-      }
-    }
-    if (keep) {
-      selected_indices->push_back(idx);
-    }
-    sorted_indices.erase(sorted_indices.begin());
-    if (keep && eta < 1 && adaptive_threshold > 0.5) {
-      adaptive_threshold *= eta;
+    // The total boxes for each instance.
+    int num_boxes = bbox_dim[0];
+    // 4: [xmin ymin xmax ymax]
+    int box_size = bbox_dim[1];
+    
+    std::vector<T> scores_data(num_boxes);
+    std::copy_n(score_data, num_boxes, scores_data.begin());
+    std::vector<std::pair<T, int>> sorted_indices;
+    GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);
+    
+    selected_indices->clear();
+    T adaptive_threshold = nms_threshold;
+    
+    while (sorted_indices.size() != 0) {
+        const int idx = sorted_indices.front().second;
+        bool keep = true;
+        for (size_t k = 0; k < selected_indices->size(); ++k) {
+            if (keep) {
+                const int kept_idx = (*selected_indices)[k];
+                T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
+                                              bbox_data + kept_idx * box_size, true);
+                keep = overlap <= adaptive_threshold;
+            } else {
+                break;
+            }
+        }
+        if (keep) {
+            selected_indices->push_back(idx);
+        }
+        sorted_indices.erase(sorted_indices.begin());
+        if (keep && eta < 1 && adaptive_threshold > 0.5) {
+            adaptive_threshold *= eta;
+        }
     }
-  }
 }
 
 template <typename T>
@@ -165,48 +165,48 @@ void MultiClassNMS(const T *boxes_data,
                    const int& background_label, const int& nms_top_k,
                    const int& keep_top_k, const T& nms_threshold,
                    const T& nms_eta, const T& score_threshold) {
-  
-  int64_t class_num = score_dim[0];
-  int64_t predict_dim = score_dim[1];
-  int num_det = 0;
-  for (int c = 0; c < class_num; ++c) {
-    if (c == background_label) continue;
-    const T *score_data = scores_data + c * predict_dim;
     
-    /// [c] is key
-    NMSFast<T>(boxes_data, box_dim, score_data, score_threshold, nms_threshold, nms_eta,
+    int64_t class_num = score_dim[0];
+    int64_t predict_dim = score_dim[1];
+    int num_det = 0;
+    for (int c = 0; c < class_num; ++c) {
+        if (c == background_label) continue;
+        const T *score_data = scores_data + c * predict_dim;
+        
+        /// The class label c is the key into *indices for this class's kept boxes
+        NMSFast<T>(boxes_data, box_dim, score_data, score_threshold, nms_threshold, nms_eta,
                    nms_top_k, &((*indices)[c]));
-    num_det += (*indices)[c].size();
-  }
-
-  *num_nmsed_out = num_det;
-  if (keep_top_k > -1 && num_det > keep_top_k) {
-    std::vector<std::pair<T, std::pair<int, int>>> score_index_pairs;
-    for (const auto& it : *indices) {
-      int label = it.first;
-      const T* sdata = scores_data + label * predict_dim;
-      const std::vector<int>& label_indices = it.second;
-      for (size_t j = 0; j < label_indices.size(); ++j) {
-        int idx = label_indices[j];
-        // PADDLE_ENFORCE_LT(idx, predict_dim);
-        score_index_pairs.push_back(std::make_pair(sdata[idx], std::make_pair(label, idx)));
-      }
+        num_det += (*indices)[c].size();
     }
-    // Keep top k results per image.
-    std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
-                     SortScorePairDescend<std::pair<int, int>>);
-    score_index_pairs.resize(keep_top_k);
-
-    // Store the new indices.
-    std::map<int, std::vector<int>> new_indices;
-    for (size_t j = 0; j < score_index_pairs.size(); ++j) {
-      int label = score_index_pairs[j].second.first;
-      int idx = score_index_pairs[j].second.second;
-      new_indices[label].push_back(idx);
+    
+    *num_nmsed_out = num_det;
+    if (keep_top_k > -1 && num_det > keep_top_k) {
+        std::vector<std::pair<T, std::pair<int, int>>> score_index_pairs;
+        for (const auto& it : *indices) {
+            int label = it.first;
+            const T* sdata = scores_data + label * predict_dim;
+            const std::vector<int>& label_indices = it.second;
+            for (size_t j = 0; j < label_indices.size(); ++j) {
+                int idx = label_indices[j];
+                // PADDLE_ENFORCE_LT(idx, predict_dim);
+                score_index_pairs.push_back(std::make_pair(sdata[idx], std::make_pair(label, idx)));
+            }
+        }
+        // Keep top k results per image.
+        std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
+                         SortScorePairDescend<std::pair<int, int>>);
+        score_index_pairs.resize(keep_top_k);
+        
+        // Store the new indices.
+        std::map<int, std::vector<int>> new_indices;
+        for (size_t j = 0; j < score_index_pairs.size(); ++j) {
+            int label = score_index_pairs[j].second.first;
+            int idx = score_index_pairs[j].second.second;
+            new_indices[label].push_back(idx);
+        }
+        new_indices.swap(*indices);
+        *num_nmsed_out = keep_top_k;
     }
-    new_indices.swap(*indices);
-    *num_nmsed_out = keep_top_k;
-  }
 }
 
 template <typename T>
@@ -215,69 +215,69 @@ void MultiClassOutput(const T *scores_data,
                       const T *bboxes_data,
                       T *outputs_data,
                       const std::map<int, std::vector<int>>& selected_indices) {
-  int predict_dim = score_dim[1];
-  int count = 0;
-  for (const auto& it : selected_indices) {
-    /// one batch
-    int label = it.first;
-    const T* sdata = scores_data + label * predict_dim;
-    const std::vector<int>& indices = it.second;
-    for (size_t j = 0; j < indices.size(); ++j) {
-      int idx = indices[j];
-      const T* bdata = bboxes_data + idx * kBBoxSize;
-      outputs_data[count * kOutputDim] = label;           // label
-      outputs_data[count * kOutputDim + 1] = sdata[idx];  // score
-      // xmin, ymin, xmax, ymax
-      std::memcpy(outputs_data + count * kOutputDim + 2, bdata, 4 * sizeof(T));
-      count++;
+    int predict_dim = score_dim[1];
+    int count = 0;
+    for (const auto& it : selected_indices) {
+        /// one batch
+        int label = it.first;
+        const T* sdata = scores_data + label * predict_dim;
+        const std::vector<int>& indices = it.second;
+        for (size_t j = 0; j < indices.size(); ++j) {
+            int idx = indices[j];
+            const T* bdata = bboxes_data + idx * kBBoxSize;
+            outputs_data[count * kOutputDim] = label;           // label
+            outputs_data[count * kOutputDim + 1] = sdata[idx];  // score
+            // xmin, ymin, xmax, ymax
+            std::memcpy(outputs_data + count * kOutputDim + 2, bdata, 4 * sizeof(T));
+            count++;
+        }
     }
-  }
 }
 
 void MultiClassNMSCompute(NMSParam *param) {
-  assert(param->score_dim[0] == 1);
-  assert(param->box_dim[0] == 1);
-  assert (param->score_dim.size() == 3);
-  assert(param->box_dim.size() == 3);
-
-  float* outputs;
-  auto background_label = param->background_label;
-  auto nms_top_k = param->nmsTopK;
-  auto keep_top_k = param->keepTopK;
-  auto nms_threshold = param->nmsThreshold;
-  auto nms_eta = param->nmsEta;
-  auto score_threshold = param->scoreThredshold;
-
-  std::vector<int> score_dim_one_batch = {param->score_dim[1], param->score_dim[2]};
-  std::vector<int> box_dim_one_batch = {param->box_dim[1], param->box_dim[2]};
-  
-  std::vector<int> batch_starts = {0};
-  
-  std::map<int, std::vector<int>> indices;
-  int num_nmsed_out = 0;
-  
-  MultiClassNMS<float>(param->box_data, box_dim_one_batch, param->score_data, score_dim_one_batch, &indices, &num_nmsed_out,
-                       background_label, nms_top_k, keep_top_k, nms_threshold,
-                       nms_eta, score_threshold);
-  batch_starts.push_back(batch_starts.back() + num_nmsed_out);
-
-  int output_size = 0;
-  int num_kept = batch_starts.back();
-  if (num_kept == 0) {
-    outputs = new float[1];
-    outputs[0] = -1;
-    output_size = 1;
-  } else {
-    outputs = new float[num_kept * kOutputDim];
-    int64_t s = batch_starts[0];
-    int64_t e = batch_starts[1];
-    if (e > s) {
-      MultiClassOutput<float>(param->score_data, score_dim_one_batch, param->box_data, outputs, indices);
+    assert(param->score_dim[0] == 1);
+    assert(param->box_dim[0] == 1);
+    assert (param->score_dim.size() == 3);
+    assert(param->box_dim.size() == 3);
+    
+    float* outputs;
+    auto background_label = param->background_label;
+    auto nms_top_k = param->nmsTopK;
+    auto keep_top_k = param->keepTopK;
+    auto nms_threshold = param->nmsThreshold;
+    auto nms_eta = param->nmsEta;
+    auto score_threshold = param->scoreThredshold;
+    
+    std::vector<int> score_dim_one_batch = {param->score_dim[1], param->score_dim[2]};
+    std::vector<int> box_dim_one_batch = {param->box_dim[1], param->box_dim[2]};
+    
+    std::vector<int> batch_starts = {0};
+    
+    std::map<int, std::vector<int>> indices;
+    int num_nmsed_out = 0;
+    
+    MultiClassNMS<float>(param->box_data, box_dim_one_batch, param->score_data, score_dim_one_batch, &indices, &num_nmsed_out,
+                         background_label, nms_top_k, keep_top_k, nms_threshold,
+                         nms_eta, score_threshold);
+    batch_starts.push_back(batch_starts.back() + num_nmsed_out);
+    
+    int output_size = 0;
+    int num_kept = batch_starts.back();
+    if (num_kept == 0) {
+        outputs = new float[1];
+        outputs[0] = -1;
+        output_size = 1;
+    } else {
+        outputs = new float[num_kept * kOutputDim];
+        int64_t s = batch_starts[0];
+        int64_t e = batch_starts[1];
+        if (e > s) {
+            MultiClassOutput<float>(param->score_data, score_dim_one_batch, param->box_data, outputs, indices);
+        }
+        output_size = num_kept * kOutputDim;
     }
-    output_size = num_kept * kOutputDim;
-  }
-  param->output = outputs;
-  param->output_size = output_size;
+    param->output = outputs;
+    param->output_size = output_size;
 }
 
 @implementation CPUResult
@@ -286,31 +286,31 @@ void MultiClassNMSCompute(NMSParam *param) {
 @implementation NMSCompute
 
 -(CPUResult *)computeWithScore:(float *)score andBBoxs:(float *)bbox {
-  NMSParam param;
-  param.box_data = bbox;
-  param.score_data = score;
-  param.background_label = self.background_label;
-  param.scoreThredshold = self.scoreThredshold;
-  param.nmsTopK = self.nmsTopK;
-  param.keepTopK = self.keepTopK;
-  param.nmsEta = self.nmsEta;
-  param.nmsThreshold = self.nmsThreshold;
-  std::vector<int> score_dim;
-  for (int i = 0; i < self.scoreDim.count; ++i) {
-    score_dim.push_back(self.scoreDim[i].intValue);
-  }
-  param.score_dim = score_dim;
-  
-  std::vector<int> box_dim;
-  for (int i = 0; i < self.bboxDim.count; ++i) {
-    box_dim.push_back(self.bboxDim[i].intValue);
-  }
-  param.box_dim = box_dim;
-  MultiClassNMSCompute(&param);
-  CPUResult *cr = [[CPUResult alloc] init];
-  cr.output = param.output;
-  cr.outputSize = param.output_size;
-  return cr;
+    NMSParam param;
+    param.box_data = bbox;
+    param.score_data = score;
+    param.background_label = self.background_label;
+    param.scoreThredshold = self.scoreThredshold;
+    param.nmsTopK = self.nmsTopK;
+    param.keepTopK = self.keepTopK;
+    param.nmsEta = self.nmsEta;
+    param.nmsThreshold = self.nmsThreshold;
+    std::vector<int> score_dim;
+    for (int i = 0; i < self.scoreDim.count; ++i) {
+        score_dim.push_back(self.scoreDim[i].intValue);
+    }
+    param.score_dim = score_dim;
+    
+    std::vector<int> box_dim;
+    for (int i = 0; i < self.bboxDim.count; ++i) {
+        box_dim.push_back(self.bboxDim[i].intValue);
+    }
+    param.box_dim = box_dim;
+    MultiClassNMSCompute(&param);
+    CPUResult *cr = [[CPUResult alloc] init];
+    cr.output = param.output;
+    cr.outputSize = param.output_size;
+    return cr;
 }
 
 @end
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Genet.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Genet.swift
index 91bf014e9f..b248e53bac 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Genet.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Genet.swift
@@ -16,37 +16,37 @@ import Foundation
 import paddle_mobile
 
 public class Genet: Net {
-  @objc public override init(device: MTLDevice) {
-    super.init(device: device)
-    modelPath = Bundle.main.path(forResource: "genet_model", ofType: nil) ?! "model null"
-    paramPath = Bundle.main.path(forResource: "genet_params", ofType: nil) ?! "para null"
-    preprocessKernel = GenetPreProccess.init(device: device)
-    inputDim = Dim.init(inDim: [1, 128, 128, 3])
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-  }
-  
-  @objc override public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) {
-    super.init(device: device,
-               inParamPointer: inParamPointer,
-               inParamSize: inParamSize,
-               inModelPointer: inModelPointer,
-               inModelSize: inModelSize)
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-    preprocessKernel = GenetPreProccess.init(device: device)
-    inputDim = Dim.init(inDim: [1, 128, 128, 3])
-  }
-
-  class GenetPreProccess: CusomKernel {
-    init(device: MTLDevice) {
-      let s = Shape.init(inWidth: 128, inHeight: 128, inChannel: 3)
-      super.init(device: device, inFunctionName: "genet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
+    @objc public override init(device: MTLDevice) {
+        super.init(device: device)
+        modelPath = Bundle.main.path(forResource: "genet_model", ofType: nil) ?! "model null"
+        paramPath = Bundle.main.path(forResource: "genet_params", ofType: nil) ?! "para null"
+        preprocessKernel = GenetPreProccess.init(device: device)
+        inputDim = Dim.init(inDim: [1, 128, 128, 3])
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+    }
+    
+    @objc override public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) {
+        super.init(device: device,
+                   inParamPointer: inParamPointer,
+                   inParamSize: inParamSize,
+                   inModelPointer: inModelPointer,
+                   inModelSize: inModelSize)
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+        preprocessKernel = GenetPreProccess.init(device: device)
+        inputDim = Dim.init(inDim: [1, 128, 128, 3])
+    }
+    
+    class GenetPreProccess: CusomKernel {
+        init(device: MTLDevice) {
+            let s = Shape.init(inWidth: 128, inHeight: 128, inChannel: 3)
+            super.init(device: device, inFunctionName: "genet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
+        }
+    }
+    
+    override  public func resultStr(res: [ResultHolder]) -> String {
+        return " \(res[0].result[0]) ... "
     }
-  }
-  
-  override  public func resultStr(res: [ResultHolder]) -> String {
-    return " \(res[0].result[0]) ... "
-  }
-  
+    
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift
index d35fde97d7..608cd3180b 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift
@@ -16,53 +16,53 @@ import Foundation
 import paddle_mobile
 
 public class MobileNet: Net{
-  
-  class MobilenetPreProccess: CusomKernel {
-    init(device: MTLDevice) {
-      let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3)
-      super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
-    }
-  }
-  
-  class PreWords {
-    var contents: [String] = []
-    init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) {
-      if let filePath = inBundle.path(forResource: fileName, ofType: type) {
-        let string = try! String.init(contentsOfFile: filePath)
-        contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{
-          String($0[$0.index($0.startIndex, offsetBy: 10)...])
+    
+    class MobilenetPreProccess: CusomKernel {
+        init(device: MTLDevice) {
+            let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3)
+            super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
         }
-      }else{
-        fatalError("no file call \(fileName)")
-      }
     }
-    subscript(index: Int) -> String {
-      return contents[index]
+    
+    class PreWords {
+        var contents: [String] = []
+        init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) {
+            if let filePath = inBundle.path(forResource: fileName, ofType: type) {
+                let string = try! String.init(contentsOfFile: filePath)
+                contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{
+                    String($0[$0.index($0.startIndex, offsetBy: 10)...])
+                }
+            }else{
+                fatalError("no file call \(fileName)")
+            }
+        }
+        subscript(index: Int) -> String {
+            return contents[index]
+        }
     }
-  }
-  
-  let labels = PreWords.init(fileName: "synset")
-  
-  override public func resultStr(res: [ResultHolder]) -> String {
-    let resPointer = res[0].result
-    var s: [String] = []
-    (0..<res[0].capacity).map { resPointer[$0] }.top(r: 5).enumerated().forEach{
-      s.append(String(format: "%d: %@ (%3.2f%%)", $0 + 1, labels[$1.0], $1.1 * 100))
+    
+    let labels = PreWords.init(fileName: "synset")
+    
+    override public func resultStr(res: [ResultHolder]) -> String {
+        let resPointer = res[0].result
+        var s: [String] = []
+        (0..<res[0].capacity).map { resPointer[$0] }.top(r: 5).enumerated().forEach{
+            s.append(String(format: "%d: %@ (%3.2f%%)", $0 + 1, labels[$1.0], $1.1 * 100))
+        }
+        return s.joined(separator: "\n")
     }
-    return s.joined(separator: "\n")
-  }
     
-  override public init(device: MTLDevice) {
-    super.init(device: device)
-    except = 0
-    modelPath = Bundle.main.path(forResource: "mobilenet_model", ofType: nil) ?! "model null"
-    paramPath = Bundle.main.path(forResource: "mobilenet_params", ofType: nil) ?! "para null"    
-//    metalLoadMode = .LoadMetalInCustomMetalLib
-//    metalLibPath = Bundle.main.path(forResource: "PaddleMobileMetal", ofType: "metallib") ?! " can't be nil "
-    preprocessKernel = MobilenetPreProccess.init(device: device)
-    inputDim = Dim.init(inDim: [1, 224, 224, 3])
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-  }
+    override public init(device: MTLDevice) {
+        super.init(device: device)
+        except = 0
+        modelPath = Bundle.main.path(forResource: "mobilenet_model", ofType: nil) ?! "model null"
+        paramPath = Bundle.main.path(forResource: "mobilenet_params", ofType: nil) ?! "para null"    
+        //    metalLoadMode = .LoadMetalInCustomMetalLib
+        //    metalLibPath = Bundle.main.path(forResource: "PaddleMobileMetal", ofType: "metallib") ?! " can't be nil "
+        preprocessKernel = MobilenetPreProccess.init(device: device)
+        inputDim = Dim.init(inDim: [1, 224, 224, 3])
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+    }
 }
 
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetCombined.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetCombined.swift
index 1ede49826d..1e644c3d54 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetCombined.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetCombined.swift
@@ -16,18 +16,18 @@ import Foundation
 import paddle_mobile
 
 public class MobileNetCombined: Net {
-  @objc public override init(device: MTLDevice) {
-    super.init(device: device)
-    except = 0
-    modelPath = Bundle.main.path(forResource: "combined_mobilenet_model", ofType: nil) ?! "model null"
-    paramPath = Bundle.main.path(forResource: "combined_mobilenet_params", ofType: nil) ?! "para null"
-    inputDim = Dim.init(inDim: [1, 224, 224, 3])
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-  }
-  
-  override  public func resultStr(res: [ResultHolder]) -> String {
-    return " \(res[0].result[0]) ... "
-  }
-  
+    @objc public override init(device: MTLDevice) {
+        super.init(device: device)
+        except = 0
+        modelPath = Bundle.main.path(forResource: "combined_mobilenet_model", ofType: nil) ?! "model null"
+        paramPath = Bundle.main.path(forResource: "combined_mobilenet_params", ofType: nil) ?! "para null"
+        inputDim = Dim.init(inDim: [1, 224, 224, 3])
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+    }
+    
+    override  public func resultStr(res: [ResultHolder]) -> String {
+        return " \(res[0].result[0]) ... "
+    }
+    
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift
index 140aefdfb3..38d20557d2 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift
@@ -16,84 +16,84 @@ import Foundation
 import paddle_mobile
 
 public class MobileNet_ssd_hand: Net {
-  @objc public override init(device: MTLDevice) {
-    super.init(device: device)
-    except = 2
-    modelPath = Bundle.main.path(forResource: "ssd_hand_model", ofType: nil) ?! "model null"
-    paramPath = Bundle.main.path(forResource: "ssd_hand_params", ofType: nil) ?! "para null"
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-    preprocessKernel = MobilenetssdPreProccess.init(device: device)
-    inputDim = Dim.init(inDim: [1, 300, 300, 3])
-  }
-  
-  @objc override public init(device: MTLDevice,inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer inModePointer: UnsafeMutableRawPointer, inModelSize: Int) {
-    super.init(device:device,inParamPointer:inParamPointer,inParamSize:inParamSize,inModelPointer:inModePointer,inModelSize:inModelSize)
-    except = 2
-    modelPath = ""
-    paramPath = ""
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-    preprocessKernel = MobilenetssdPreProccess.init(device: device)
-    inputDim = Dim.init(inDim: [1, 300, 300, 3])
-  }
-  
-  class MobilenetssdPreProccess: CusomKernel {
-    init(device: MTLDevice) {
-      let s = Shape.init(inWidth: 300, inHeight: 300, inChannel: 3)
-      super.init(device: device, inFunctionName: "mobilenet_ssd_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
+    @objc public override init(device: MTLDevice) {
+        super.init(device: device)
+        except = 2
+        modelPath = Bundle.main.path(forResource: "ssd_hand_model", ofType: nil) ?! "model null"
+        paramPath = Bundle.main.path(forResource: "ssd_hand_params", ofType: nil) ?! "para null"
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+        preprocessKernel = MobilenetssdPreProccess.init(device: device)
+        inputDim = Dim.init(inDim: [1, 300, 300, 3])
     }
-  }
-  
-  override public func resultStr(res: [ResultHolder]) -> String {
-    return " \(res[0])"
-  }
-  
-  override public func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] {
-
-//    guard let interRes = paddleMobileRes.intermediateResults else {
-//      fatalError(" need have inter result ")
-//    }
-//
-//    guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as?  Texture<Float32> else {
-//      fatalError(" need score ")
-//    }
-//
-//    guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture<Float32> else {
-//      fatalError()
-//    }
-//
-//    var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3]))
-////    print("score: ")
-////    print(scoreFormatArr.strideArray())
-////
-//    var bboxArr = bbox.metalTexture.float32Array()
-////    print("bbox: ")
-////    print(bboxArr.strideArray())
-//
-//    let nmsCompute = NMSCompute.init()
-//    nmsCompute.scoreThredshold = 0.01
-//    nmsCompute.nmsTopK = 400
-//    nmsCompute.keepTopK = 200
-//    nmsCompute.nmsEta = 1.0
-//    nmsCompute.nmsThreshold = 0.45
-//    nmsCompute.background_label = 0;
-//
-//    nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])]
-//
-//    nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])]
-//    guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else {
-//      fatalError( " result error " )
-//    }
-//
-//    let output: [Float32] = result.map { $0.floatValue }
-//
-//
-//    return output
-    fatalError()
-  }
-  
-
-  
- 
+    
+    @objc override public init(device: MTLDevice,inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer inModePointer: UnsafeMutableRawPointer, inModelSize: Int) {
+        super.init(device:device,inParamPointer:inParamPointer,inParamSize:inParamSize,inModelPointer:inModePointer,inModelSize:inModelSize)
+        except = 2
+        modelPath = ""
+        paramPath = ""
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+        preprocessKernel = MobilenetssdPreProccess.init(device: device)
+        inputDim = Dim.init(inDim: [1, 300, 300, 3])
+    }
+    
+    class MobilenetssdPreProccess: CusomKernel {
+        init(device: MTLDevice) {
+            let s = Shape.init(inWidth: 300, inHeight: 300, inChannel: 3) // same 300 x 300 x 3 extent as the net's inputDim
+            super.init(device: device, inFunctionName: "mobilenet_ssd_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) // requests .LoadMetalInDefaultLib with no lib path, whereas the net itself uses .LoadMetalInCustomMetalLib
+        }
+    }
+    
+    override public func resultStr(res: [ResultHolder]) -> String {
+        return " \(res[0])" // NOTE(review): interpolates the holder itself, while the other nets in this patch print res[0].result[0] - confirm this is intended
+    }
+    
+    override public func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] {
+        
+        //    guard let interRes = paddleMobileRes.intermediateResults else {
+        //      fatalError(" need have inter result ")
+        //    }
+        //
+        //    guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as?  Texture<Float32> else {
+        //      fatalError(" need score ")
+        //    }
+        //
+        //    guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture<Float32> else {
+        //      fatalError()
+        //    }
+        //
+        //    var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3]))
+        ////    print("score: ")
+        ////    print(scoreFormatArr.strideArray())
+        ////
+        //    var bboxArr = bbox.metalTexture.float32Array()
+        ////    print("bbox: ")
+        ////    print(bboxArr.strideArray())
+        //
+        //    let nmsCompute = NMSCompute.init()
+        //    nmsCompute.scoreThredshold = 0.01
+        //    nmsCompute.nmsTopK = 400
+        //    nmsCompute.keepTopK = 200
+        //    nmsCompute.nmsEta = 1.0
+        //    nmsCompute.nmsThreshold = 0.45
+        //    nmsCompute.background_label = 0;
+        //
+        //    nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])]
+        //
+        //    nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])]
+        //    guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else {
+        //      fatalError( " result error " )
+        //    }
+        //
+        //    let output: [Float32] = result.map { $0.floatValue }
+        //
+        //
+        //    return output
+        fatalError()
+    }
+    
+    
+    
+    
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobilenetSSD_AR.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobilenetSSD_AR.swift
index 134a07bba6..76feb0ecd0 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobilenetSSD_AR.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobilenetSSD_AR.swift
@@ -16,137 +16,137 @@ import Foundation
 import paddle_mobile
 
 public class MobileNet_ssd_AR: Net {
-  @objc public override init(device: MTLDevice) {
-    super.init(device: device)
-    except = 2
-    modelPath = Bundle.main.path(forResource: "ar_model", ofType: nil) ?! "model null"
-    paramPath = Bundle.main.path(forResource: "ar_params", ofType: nil) ?! "para null"
-    preprocessKernel = MobilenetssdPreProccess.init(device: device)
-    inputDim = Dim.init(inDim: [1, 160, 160, 3])
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-  }
-  
-  @objc override public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) {
-    super.init(device:device,inParamPointer:inParamPointer,inParamSize:inParamSize,inModelPointer:inModelPointer,inModelSize:inModelSize)
-    except = 2
-    preprocessKernel = MobilenetssdPreProccess.init(device: device)
-    inputDim = Dim.init(inDim: [1, 160, 160, 3])
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-  }
-  
-  class MobilenetssdPreProccess: CusomKernel {
-    init(device: MTLDevice)  {
-      let s = Shape.init(inWidth: 160, inHeight: 160, inChannel: 3)
-      super.init(device: device, inFunctionName: "mobilent_ar_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
+    @objc public override init(device: MTLDevice) {
+        super.init(device: device)
+        except = 2
+        modelPath = Bundle.main.path(forResource: "ar_model", ofType: nil) ?! "model null"
+        paramPath = Bundle.main.path(forResource: "ar_params", ofType: nil) ?! "para null"
+        preprocessKernel = MobilenetssdPreProccess.init(device: device)
+        inputDim = Dim.init(inDim: [1, 160, 160, 3])
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
     }
-  }
-  
-  override public func resultStr(res: [ResultHolder]) -> String {
-    return " \(res[0].result[0])"
-  }
-  
-  override public func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] {
-    fatalError()
-//    guard let interRes = paddleMobileRes.intermediateResults else {
-//      fatalError(" need have inter result ")
-//    }
-//
-//    guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as?  FetchHolder else {
-//      fatalError(" need score ")
-//    }
-//
-//    guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? FetchHolder else {
-//      fatalError()
-//    }
     
-//    let startDate = Date.init()
+    @objc override public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) {
+        super.init(device:device,inParamPointer:inParamPointer,inParamSize:inParamSize,inModelPointer:inModelPointer,inModelSize:inModelSize)
+        except = 2
+        preprocessKernel = MobilenetssdPreProccess.init(device: device)
+        inputDim = Dim.init(inDim: [1, 160, 160, 3])
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+    }
     
-//    print("scoreFormatArr: ")
-//print((0..<score.capacity).map{ score.result[$0] }.strideArray())
-//
-//    print("bbox arr: ")
-//
-//    print((0..<bbox.capacity).map{ bbox.result[$0] }.strideArray())
+    class MobilenetssdPreProccess: CusomKernel {
+        init(device: MTLDevice)  {
+            let s = Shape.init(inWidth: 160, inHeight: 160, inChannel: 3) // same 160 x 160 x 3 extent as the net's inputDim
+            super.init(device: device, inFunctionName: "mobilent_ar_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) // "mobilent" spelling matches the kernel name declared in PreProcessKernel.metal; do not correct one without the other
+        }
+    }
     
-//    let nmsCompute = NMSCompute.init()
-//    nmsCompute.scoreThredshold = 0.25
-//    nmsCompute.nmsTopK = 100
-//    nmsCompute.keepTopK = 100
-//    nmsCompute.nmsEta = 1.0
-//    nmsCompute.nmsThreshold = 0.449999988
-//    nmsCompute.background_label = 0;
-//    nmsCompute.scoreDim = [NSNumber.init(value: score.dim[0]), NSNumber.init(value: score.dim[1]), NSNumber.init(value: score.dim[2])]
-//    nmsCompute.bboxDim = [NSNumber.init(value: bbox.dim[0]), NSNumber.init(value: bbox.dim[1]), NSNumber.init(value: bbox.dim[2])]
-//    guard let result = nmsCompute.compute(withScore: score.result, andBBoxs: bbox.result) else {
-//      fatalError( " result error " )
-//    }
-//    let resultHolder = ResultHolder.init(inResult: result.output, inCapacity: Int(result.outputSize))
-//    for i in 0..<Int(result.outputSize) {
-//
-//      print("i \(i) : \(result.output[i])")
-//    }
-//    print(Date.init().timeIntervalSince(startDate))
-
-//    print(resultHolder.result![0])
-//    return resultHolder
-  }
-  
-//  override func updateProgram(program: Program) {
-  
-//    for i in [56, 66, 76, 86, 93, 99] {
-//      let opDesc = program.programDesc.blocks[0].ops[i]
-//      let output = opDesc.outputs["Out"]!.first!
-//      let v = program.scope[output]!
-//      let originTexture = v as! Texture
-//      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1] / 7, originTexture.tensorDim[0] * 7])
-//      
-//      originTexture.dim = Dim.init(inDim: [1, 1, originTexture.dim[3] / 7, originTexture.dim[2] * 7])
-//      
-//      originTexture.padToFourDim = Dim.init(inDim: [1, 1, originTexture.padToFourDim[3] / 7, originTexture.padToFourDim[2] * 7])
-//      
-//      program.scope[output] = originTexture
-//      
-//      if i == 99 {
-//        opDesc.attrs["axis"] = 0
-//      } else {
-//        opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
-//      }
-//    }
-//    
-//    for i in [58, 59, 88, 89, 95, 96, 68, 69, 78, 79] {
-//      let opDesc = program.programDesc.blocks[0].ops[i]
-//      let output = opDesc.outputs["Out"]!.first!
-//      let v = program.scope[output]!
-//      
-//      
-//      
-//      let originTexture = v as! Texture
-//      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
-//      opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
-//    }
-//    
-//    for i in [60, 101, 90, 97, 70, 80] {
-//      let opDesc = program.programDesc.blocks[0].ops[i]
-//      let output = opDesc.outputs["Out"]!.first!
-//      let v = program.scope[output]!
-//      let originTexture = v as! Texture
-//      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
-//      opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
-//    }
-//    
-//    for i in [102] {
-//      let opDesc = program.programDesc.blocks[0].ops[i]
-//      for output in opDesc.outputs["Out"]! {
-//        let v = program.scope[output]!
-//        let originTexture = v as! Texture
-//        originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
-//      }
-//      opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
-//      print(" split axis \(opDesc.attrs["axis"])")
-//    }
+    override public func resultStr(res: [ResultHolder]) -> String {
+        return " \(res[0].result[0])" // first element of the first holder's result; res must be non-empty
+    }
+    
+    override public func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] {
+        fatalError()
+        //    guard let interRes = paddleMobileRes.intermediateResults else {
+        //      fatalError(" need have inter result ")
+        //    }
+        //
+        //    guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as?  FetchHolder else {
+        //      fatalError(" need score ")
+        //    }
+        //
+        //    guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? FetchHolder else {
+        //      fatalError()
+        //    }
+        
+        //    let startDate = Date.init()
+        
+        //    print("scoreFormatArr: ")
+        //print((0..<score.capacity).map{ score.result[$0] }.strideArray())
+        //
+        //    print("bbox arr: ")
+        //
+        //    print((0..<bbox.capacity).map{ bbox.result[$0] }.strideArray())
+        
+        //    let nmsCompute = NMSCompute.init()
+        //    nmsCompute.scoreThredshold = 0.25
+        //    nmsCompute.nmsTopK = 100
+        //    nmsCompute.keepTopK = 100
+        //    nmsCompute.nmsEta = 1.0
+        //    nmsCompute.nmsThreshold = 0.449999988
+        //    nmsCompute.background_label = 0;
+        //    nmsCompute.scoreDim = [NSNumber.init(value: score.dim[0]), NSNumber.init(value: score.dim[1]), NSNumber.init(value: score.dim[2])]
+        //    nmsCompute.bboxDim = [NSNumber.init(value: bbox.dim[0]), NSNumber.init(value: bbox.dim[1]), NSNumber.init(value: bbox.dim[2])]
+        //    guard let result = nmsCompute.compute(withScore: score.result, andBBoxs: bbox.result) else {
+        //      fatalError( " result error " )
+        //    }
+        //    let resultHolder = ResultHolder.init(inResult: result.output, inCapacity: Int(result.outputSize))
+        //    for i in 0..<Int(result.outputSize) {
+        //
+        //      print("i \(i) : \(result.output[i])")
+        //    }
+        //    print(Date.init().timeIntervalSince(startDate))
+        
+        //    print(resultHolder.result![0])
+        //    return resultHolder
+    }
+    
+    //  override func updateProgram(program: Program) {
+    
+    //    for i in [56, 66, 76, 86, 93, 99] {
+    //      let opDesc = program.programDesc.blocks[0].ops[i]
+    //      let output = opDesc.outputs["Out"]!.first!
+    //      let v = program.scope[output]!
+    //      let originTexture = v as! Texture
+    //      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1] / 7, originTexture.tensorDim[0] * 7])
+    //      
+    //      originTexture.dim = Dim.init(inDim: [1, 1, originTexture.dim[3] / 7, originTexture.dim[2] * 7])
+    //      
+    //      originTexture.padToFourDim = Dim.init(inDim: [1, 1, originTexture.padToFourDim[3] / 7, originTexture.padToFourDim[2] * 7])
+    //      
+    //      program.scope[output] = originTexture
+    //      
+    //      if i == 99 {
+    //        opDesc.attrs["axis"] = 0
+    //      } else {
+    //        opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
+    //      }
+    //    }
+    //    
+    //    for i in [58, 59, 88, 89, 95, 96, 68, 69, 78, 79] {
+    //      let opDesc = program.programDesc.blocks[0].ops[i]
+    //      let output = opDesc.outputs["Out"]!.first!
+    //      let v = program.scope[output]!
+    //      
+    //      
+    //      
+    //      let originTexture = v as! Texture
+    //      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
+    //      opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
+    //    }
+    //    
+    //    for i in [60, 101, 90, 97, 70, 80] {
+    //      let opDesc = program.programDesc.blocks[0].ops[i]
+    //      let output = opDesc.outputs["Out"]!.first!
+    //      let v = program.scope[output]!
+    //      let originTexture = v as! Texture
+    //      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
+    //      opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
+    //    }
+    //    
+    //    for i in [102] {
+    //      let opDesc = program.programDesc.blocks[0].ops[i]
+    //      for output in opDesc.outputs["Out"]! {
+    //        let v = program.scope[output]!
+    //        let originTexture = v as! Texture
+    //        originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
+    //      }
+    //      opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
+    //      print(" split axis \(opDesc.attrs["axis"])")
+    //    }
     // 99
-//  }
-  
+    //  }
+    
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal
index 9bd2c26e34..99bd8f4a03 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal
@@ -17,9 +17,9 @@ using namespace metal;
 
 
 kernel void mobilenet_preprocess(
-                       texture2d<float, access::read> inTexture [[texture(0)]],
-                       texture2d<float, access::write> outTexture [[texture(1)]],
-                       uint2 gid [[thread_position_in_grid]])
+                                 texture2d<float, access::read> inTexture [[texture(0)]],
+                                 texture2d<float, access::write> outTexture [[texture(1)]],
+                                 uint2 gid [[thread_position_in_grid]])
 {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height()) {
@@ -31,9 +31,9 @@ kernel void mobilenet_preprocess(
 }
 
 kernel void mobilenet_preprocess_half(
-                       texture2d<half, access::read> inTexture [[texture(0)]],
-                       texture2d<half, access::write> outTexture [[texture(1)]],
-                       uint2 gid [[thread_position_in_grid]])
+                                      texture2d<half, access::read> inTexture [[texture(0)]],
+                                      texture2d<half, access::write> outTexture [[texture(1)]],
+                                      uint2 gid [[thread_position_in_grid]])
 {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height()) {
@@ -45,9 +45,9 @@ kernel void mobilenet_preprocess_half(
 }
 
 kernel void mobilenet_ssd_preprocess(
-                       texture2d<float, access::read> inTexture [[texture(0)]],
-                       texture2d<float, access::write> outTexture [[texture(1)]],
-                       uint2 gid [[thread_position_in_grid]])
+                                     texture2d<float, access::read> inTexture [[texture(0)]],
+                                     texture2d<float, access::write> outTexture [[texture(1)]],
+                                     uint2 gid [[thread_position_in_grid]])
 {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height()) {
@@ -59,9 +59,9 @@ kernel void mobilenet_ssd_preprocess(
 }
 
 kernel void mobilenet_ssd_preprocess_half(
-                            texture2d<half, access::read> inTexture [[texture(0)]],
-                            texture2d<half, access::write> outTexture [[texture(1)]],
-                            uint2 gid [[thread_position_in_grid]])
+                                          texture2d<half, access::read> inTexture [[texture(0)]],
+                                          texture2d<half, access::write> outTexture [[texture(1)]],
+                                          uint2 gid [[thread_position_in_grid]])
 {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height()) {
@@ -74,44 +74,44 @@ kernel void mobilenet_ssd_preprocess_half(
 
 kernel void genet_preprocess(texture2d<float, access::read> inTexture [[texture(0)]], texture2d<float, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]])
 {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) {
-    return;
-  }
-  const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f);
-  const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
-  outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return; // this thread lies outside the output texture
+    }
+    const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f); // subtracted from x/y/z; w is left untouched
+    const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; // scale by 255, subtract means, multiply by 0.017
+    outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); // x and z are swapped, w is written as 0
 }
 
 kernel void genet_preprocess_half(texture2d<half, access::read> inTexture [[texture(0)]], texture2d<half, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]])
 {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) {
-    return;
-  }
-  const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f);
-  const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
-  outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return; // this thread lies outside the output texture
+    }
+    const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f); // subtracted from x/y/z; w is left untouched
+    const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; // same arithmetic as genet_preprocess, on half4 values
+    outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); // x and z are swapped, w is written as 0
 }
 
 kernel void mobilent_ar_preprocess(texture2d<float, access::read> inTexture [[texture(0)]], texture2d<float, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]])
 {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) {
-    return;
-  }
-  const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f);
-  const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
-  outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return; // this thread lies outside the output texture
+    }
+    const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f); // subtracted from x/y/z; w is left untouched
+    const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; // scale by 255, subtract means, multiply by 0.017 (body is identical to genet_preprocess)
+    outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); // x and z are swapped, w is written as 0
 }
 
 kernel void mobilent_ar_preprocess_half(texture2d<half, access::read> inTexture [[texture(0)]], texture2d<half, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]])
 {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) {
-    return;
-  }
-  const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f);
-  const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
-  outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return; // this thread lies outside the output texture
+    }
+    const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f); // subtracted from x/y/z; w is left untouched
+    const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; // same arithmetic as mobilent_ar_preprocess, on half4 values
+    outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); // x and z are swapped, w is written as 0
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift
index f5f4ef81e9..caaef97695 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift
@@ -17,18 +17,20 @@ import Foundation
 import paddle_mobile
 
 public class YoloNet: Net {
-  @objc public override init(device: MTLDevice) {
-    super.init(device: device)
-    except = 0
-    modelPath = Bundle.main.path(forResource: "yolo_model", ofType: nil) ?! "model null"
-    paramPath = Bundle.main.path(forResource: "yolo_params", ofType: nil) ?! "para null"
-    inputDim = Dim.init(inDim: [1, 416, 416, 3])
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-  }
-
-  override  public func resultStr(res: [ResultHolder]) -> String {
-    return " \(res[0].result[0]) ... "
-  }
-  
+    /// Points the net at the bundled "yolo_model" / "yolo_params" resources,
+    /// a [1, 416, 416, 3] input and the custom "paddle-mobile-metallib" Metal library.
+    @objc public override init(device: MTLDevice) {
+        super.init(device: device)
+        except = 0
+        modelPath = Bundle.main.path(forResource: "yolo_model", ofType: nil) ?! "model null"
+        paramPath = Bundle.main.path(forResource: "yolo_params", ofType: nil) ?! "para null"
+        inputDim = Dim.init(inDim: [1, 416, 416, 3])
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+    }
+
+    /// Formats the first element of the first holder's `result` as " <value> ... "; `res` must be non-empty.
+    override public func resultStr(res: [ResultHolder]) -> String {
+        return " \(res[0].result[0]) ... "
+    }
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.m b/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.m
index 586fc91a7f..5bef9317b1 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.m
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.m
@@ -34,83 +34,110 @@
 @implementation LoadPointerViewController
 
+// Reads the whole file at `path` into a malloc'ed buffer and stores its byte
+// count in *outSize. Returns NULL (leaving *outSize at 0) when the file cannot
+// be opened, is empty, or cannot be read completely. The caller owns the buffer.
+static void *PMReadFile(NSString *path, long *outSize) {
+    *outSize = 0;
+    FILE *fp = path ? fopen([path UTF8String], "rb") : NULL;
+    if (!fp) {
+        return NULL;
+    }
+    void *buffer = NULL;
+    long size = -1;
+    if (fseek(fp, 0, SEEK_END) == 0) {
+        size = ftell(fp);
+    }
+    if (size > 0) {
+        rewind(fp);
+        buffer = malloc((size_t)size);
+        if (buffer && fread(buffer, 1, (size_t)size, fp) != (size_t)size) {
+            free(buffer);
+            buffer = NULL;
+        }
+    }
+    fclose(fp);
+    if (buffer) {
+        *outSize = size;
+    }
+    return buffer;
+}
+
 - (void)viewDidLoad {
-  [super viewDidLoad];
-  
-  self.imageView.image = [UIImage imageNamed:@"banana.jpeg"];
-  
-  NSString *modelPath = [[NSBundle mainBundle] URLForResource:@"super_model" withExtension:nil].path;
-  NSString *paramPath = [[NSBundle mainBundle] URLForResource:@"super_params" withExtension:nil].path;
-
-  long fileSize;
-  FILE *fp;
-  fp = fopen([modelPath UTF8String], "rb");
-  fseek(fp, 0, SEEK_END);
-  fileSize = ftell(fp);
-  rewind(fp);
-  void *buffer = malloc(fileSize);
-  fread(buffer, 1, fileSize, fp);
-  fclose(fp);
-  
-  long paramfileSize;
-  FILE *parmaFilePointer;
-  parmaFilePointer = fopen([paramPath UTF8String], "rb");
-  fseek(parmaFilePointer, 0, SEEK_END);
-  paramfileSize = ftell(parmaFilePointer);
-  rewind(parmaFilePointer);
-  void *parmaBuffer = malloc(paramfileSize);
-  fread(parmaBuffer, 1, paramfileSize, parmaFilePointer);
-  fclose(parmaFilePointer);
-  
-  _modelConfig = [[ModelConfig alloc] init];
-  _modelConfig.modelPointer = buffer;
-  _modelConfig.modelSize = (int)fileSize;
-  _modelConfig.paramPointer = parmaBuffer;
-  _modelConfig.paramSize = (int)paramfileSize;
+    [super viewDidLoad];
+
+    self.imageView.image = [UIImage imageNamed:@"banana.jpeg"];
+
+    NSString *modelPath = [[NSBundle mainBundle] URLForResource:@"super_model" withExtension:nil].path;
+    NSString *paramPath = [[NSBundle mainBundle] URLForResource:@"super_params" withExtension:nil].path;
+
+    long fileSize = 0;
+    void *buffer = PMReadFile(modelPath, &fileSize);
+    long paramfileSize = 0;
+    void *parmaBuffer = PMReadFile(paramPath, &paramfileSize);
+    if (!buffer || !parmaBuffer) {
+        // Leave _modelConfig nil so loaderButtonPressed: can refuse to load.
+        NSLog(@" failed to read model or param file");
+        free(buffer);
+        free(parmaBuffer);
+        return;
+    }
+
+    // The buffers are handed to the config as raw pointers and are not freed here.
+    _modelConfig = [[ModelConfig alloc] init];
+    _modelConfig.modelPointer = buffer;
+    _modelConfig.modelSize = (int)fileSize;
+    _modelConfig.paramPointer = parmaBuffer;
+    _modelConfig.paramSize = (int)paramfileSize;
 }
 - (IBAction)loaderButtonPressed:(id)sender {
-  self.paddleMobile = [[PaddleMobileGPU alloc] initWithCommandQueue:MetalHelper.shared.queue net:SuperResolutionNetType modelConfig:_modelConfig];
-  _loaded = [self.paddleMobile load];
-  NSLog(@" load 结果: %@", _loaded ? @"成功" : @"失败");
+    // viewDidLoad leaves _modelConfig nil when the model files could not be read.
+    if (!_modelConfig) {
+        NSLog(@" model config is nil, skip load");
+        return;
+    }
+    self.paddleMobile = [[PaddleMobileGPU alloc] initWithCommandQueue:MetalHelper.shared.queue net:SuperResolutionNetType modelConfig:_modelConfig];
+    _loaded = [self.paddleMobile load];
+    NSLog(@" load 结果: %@", _loaded ? @"成功" : @"失败");
 }
 - (IBAction)predictButtonPressed:(id)sender {
-  [self predict];
+    [self predict];
 }
 
 - (void)predict {
-  UIImage *image = self.imageView.image;
-  if (!image) {
-    NSLog(@" image is nil");
-    return;
-  }
-  id<MTLTexture> texture = [MetalHelper.shared.textureLoader newTextureWithCGImage:image.CGImage options:nil error:nil];
-  _texture = texture;
-  if (!_texture) {
-    NSLog(@" texture is nil");
-    return;
-  }
-  
-  if (!self.loaded) {
-    NSLog(@" not load ");
-    return;
-  }
-  
-  NSTimeInterval startTime = [[NSDate date] timeIntervalSince1970];
-  NSInteger max = 1;
-  for (int i = 0;i < max; i ++) {
-    [self.paddleMobile predict:_texture withCompletion:^(BOOL success , NSArray<NSNumber *> *result) {
-      if (success) {
-        if (i == max -1) {
-          double time = [[NSDate date] timeIntervalSince1970] - startTime;
-          time = (time/max)*1000;
-          NSLog(@"gap ==== %fms",time);
-        }
-      }
-    }];
-  }
+    UIImage *image = self.imageView.image;
+    if (!image) {
+        NSLog(@" image is nil");
+        return;
+    }
+    id<MTLTexture> texture = [MetalHelper.shared.textureLoader newTextureWithCGImage:image.CGImage options:nil error:nil];
+    _texture = texture;
+    if (!_texture) {
+        NSLog(@" texture is nil");
+        return;
+    }
+    
+    if (!self.loaded) {
+        NSLog(@" not load ");
+        return;
+    }
+    
+    NSTimeInterval startTime = [[NSDate date] timeIntervalSince1970];
+    NSInteger max = 1;
+    for (int i = 0;i < max; i ++) {
+        [self.paddleMobile predict:_texture withCompletion:^(BOOL success , NSArray<NSNumber *> *result) {
+            if (success) {
+                if (i == max -1) {
+                    double time = [[NSDate date] timeIntervalSince1970] - startTime;
+                    time = (time/max)*1000;
+                    NSLog(@"gap ==== %fms",time);
+                }
+            }
+        }];
+    }
 }
 - (IBAction)clear:(id)sender {
-  [self.paddleMobile clear];
-  self.loaded = NO;
+    [self.paddleMobile clear];
+    self.loaded = NO;
 }
 
 @end
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.h b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.h
index cd99ddad43..d45d7daaa1 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.h
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.h
@@ -16,8 +16,8 @@
 #import <Foundation/Foundation.h>
 
 typedef enum : NSUInteger {
-  SuperResolutionNetType,
-  MobileNetSSDType
+    SuperResolutionNetType,
+    MobileNetSSDType
 } NetType;
 
 @interface PaddleMobileGPUResult: NSObject
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.m b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.m
index 670753fd9f..881a6cb505 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.m
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.m
@@ -30,75 +30,75 @@
 
 @implementation PaddleMobileGPUResult
 - (void)setOutputResult:(ResultHolder *)resultHolder {
-  self.resultHolder = resultHolder;
-  self.output = resultHolder.result;
-  self.outputSize = resultHolder.capacity;
+    self.resultHolder = resultHolder;
+    self.output = resultHolder.result;
+    self.outputSize = resultHolder.capacity;
 }
 
 -(void)releaseOutput {
-  [self.resultHolder releasePointer];
+    [self.resultHolder releasePointer];
 }
 @end
 
 @interface PaddleMobileGPU ()
 {
-  Runner *runner;
+    Runner *runner;
 }
 @end
 
 @implementation PaddleMobileGPU
 
 -(instancetype)initWithCommandQueue:(id<MTLCommandQueue>)queue net:(NetType)netType modelConfig:(ModelConfig *)config {
-  self = [super init];
-  if (self) {
-    Net *net = nil;
-    if (netType == SuperResolutionNetType) {
-      net = [[SuperResolutionNet alloc] initWithDevice:queue.device inParamPointer:config.paramPointer inParamSize:config.paramSize inModelPointer:config.modelPointer inModelSize:config.modelSize];
-    } else if (netType == MobileNetSSDType) {
-      net = [[MobileNet_ssd_AR alloc] initWithDevice:queue.device inParamPointer:config.paramPointer inParamSize:config.paramSize inModelPointer:config.modelPointer inModelSize:config.modelSize];
+    self = [super init];
+    if (self) {
+        Net *net = nil;
+        if (netType == SuperResolutionNetType) {
+            net = [[SuperResolutionNet alloc] initWithDevice:queue.device inParamPointer:config.paramPointer inParamSize:config.paramSize inModelPointer:config.modelPointer inModelSize:config.modelSize];
+        } else if (netType == MobileNetSSDType) {
+            net = [[MobileNet_ssd_AR alloc] initWithDevice:queue.device inParamPointer:config.paramPointer inParamSize:config.paramSize inModelPointer:config.modelPointer inModelSize:config.modelSize];
+        }
+        runner = [[Runner alloc] initInNet:net commandQueue:queue];
     }
-    runner = [[Runner alloc] initInNet:net commandQueue:queue];
-  }
-  return self;
+    return self;
 }
 
 -(BOOL)load {
-  return [runner load];
+    return [runner load];
 }
 
 -(void)predict:(id<MTLTexture>)texture withCompletion:(void (^)(BOOL, NSArray<NSArray <NSNumber *>*> *))completion {
-  
-  [runner predictWithTexture:texture completion:^(BOOL success, NSArray<ResultHolder *> * _Nullable resultArr) {
-    NSMutableArray<NSMutableArray <NSNumber *>*> *ocResultArray = [NSMutableArray arrayWithCapacity:resultArr.count];
-    for (int i = 0; i < resultArr.count; ++i) {
-      ResultHolder *resultHolder = resultArr[i];
-      NSMutableArray <NSNumber *>*res = [NSMutableArray arrayWithCapacity:resultHolder.capacity];
-      for (int j = 0; j < resultHolder.capacity; ++j) {
-        [res addObject:[NSNumber numberWithFloat:resultHolder.result[i]]];
-      }
-      [ocResultArray addObject:res];
-      [resultHolder releasePointer];
-    }
-    completion(success, ocResultArray);
-  }];
+    
+    [runner predictWithTexture:texture completion:^(BOOL success, NSArray<ResultHolder *> * _Nullable resultArr) { // converts every ResultHolder into an NSArray of NSNumber before calling `completion`
+        NSMutableArray<NSMutableArray <NSNumber *>*> *ocResultArray = [NSMutableArray arrayWithCapacity:resultArr.count];
+        for (int i = 0; i < resultArr.count; ++i) { // i walks the result holders
+            ResultHolder *resultHolder = resultArr[i];
+            NSMutableArray <NSNumber *>*res = [NSMutableArray arrayWithCapacity:resultHolder.capacity];
+            for (int j = 0; j < resultHolder.capacity; ++j) { // j walks the values inside one holder
+                [res addObject:[NSNumber numberWithFloat:resultHolder.result[j]]]; // must index with j; indexing with i repeated one value `capacity` times
+            }
+            [ocResultArray addObject:res];
+            [resultHolder releasePointer]; // called only after the holder's values have been copied into `res`
+        }
+        completion(success, ocResultArray);
+    }];
 }
 
 -(void)predict:(id<MTLTexture>)texture withResultCompletion:(void (^)(BOOL, NSArray <PaddleMobileGPUResult *> *))completion {
-  [runner predictWithTexture:texture completion:^(BOOL success, NSArray<ResultHolder *> * _Nullable resultArr) {
-    NSMutableArray <PaddleMobileGPUResult *> *ocResultArr = [NSMutableArray arrayWithCapacity:resultArr.count];
-    for (int i = 0; i < resultArr.count; ++i) {
-      ResultHolder *result = resultArr[i];
-      PaddleMobileGPUResult *gpuResult = [[PaddleMobileGPUResult alloc] init];
-      gpuResult.dim = result.dim;
-      [gpuResult setOutputResult:result];
-      [ocResultArr addObject:gpuResult];
-    }
-    completion(success, ocResultArr);
-  }];
+    [runner predictWithTexture:texture completion:^(BOOL success, NSArray<ResultHolder *> * _Nullable resultArr) {
+        NSMutableArray <PaddleMobileGPUResult *> *ocResultArr = [NSMutableArray arrayWithCapacity:resultArr.count];
+        for (int i = 0; i < resultArr.count; ++i) {
+            ResultHolder *result = resultArr[i];
+            PaddleMobileGPUResult *gpuResult = [[PaddleMobileGPUResult alloc] init];
+            gpuResult.dim = result.dim;
+            [gpuResult setOutputResult:result];
+            [ocResultArr addObject:gpuResult];
+        }
+        completion(success, ocResultArr);
+    }];
 }
 
 -(void)clear {
-  [runner clear];
+    [runner clear];
 }
 
 @end
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift
index d2bebb2668..50dd29095e 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift
@@ -16,57 +16,57 @@ import Foundation
 import paddle_mobile
 
 @objc public class SuperResolutionNet: Net{
-  override public func resultStr(res: [ResultHolder]) -> String {
-    return "未实现"
-  }
-  
-  public override init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize: Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) {
-    super.init(device: device)
-    except = 0
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-    inputDim = Dim.init(inDim: [1, 224, 224, 3])
-    self.paramPointer = inParamPointer
-    self.paramSize = inParamSize
-    self.modelPointer = inModelPointer
-    self.modelSize = inModelSize
-  }
+    override public func resultStr(res: [ResultHolder]) -> String {
+        return "未实现"
+    }
+    
+    public override init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize: Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) {
+        super.init(device: device)
+        except = 0
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+        inputDim = Dim.init(inDim: [1, 224, 224, 3])
+        self.paramPointer = inParamPointer
+        self.paramSize = inParamSize
+        self.modelPointer = inModelPointer
+        self.modelSize = inModelSize
+    }
+    
+    @objc override public init(device: MTLDevice) {
+        super.init(device: device)
+        except = 0
+        modelPath = Bundle.main.path(forResource: "super_model", ofType: nil) ?! "model null"
+        paramPath = Bundle.main.path(forResource: "super_params", ofType: nil) ?! "para null"
+        preprocessKernel = nil
+        inputDim = Dim.init(inDim: [1, 224, 224, 1])
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+    }
     
-  @objc override public init(device: MTLDevice) {
-    super.init(device: device)
-    except = 0
-    modelPath = Bundle.main.path(forResource: "super_model", ofType: nil) ?! "model null"
-    paramPath = Bundle.main.path(forResource: "super_params", ofType: nil) ?! "para null"
-    preprocessKernel = nil
-    inputDim = Dim.init(inDim: [1, 224, 224, 1])
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-  }
-  
-  override public func updateProgram(program: Program) {
-    // n h w c
-    for block in program.programDesc.blocks {
-      for varDesc in block.vars {
-        if !varDesc.persistable {
-          if varDesc.type == .LodTensor {
-            let varEle = program.scope.vars[varDesc.name]
-            if let texture = varEle as? Texture {
-              let newDim = Dim.init(inDim: [texture.dim[0],  inputDim[1], inputDim[2], texture.tensorDim[1]])
-              print(" var desc name " + varDesc.name + " new dim" + "\(newDim)")
-              
-              texture.updateDims(inTensorDim: Dim.init(inDim: [texture.tensorDim[0], texture.tensorDim[1], inputDim[1], inputDim[2]]), inDim: newDim)
-              texture.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: GlobalConfig.shared.computePrecision)
-              
-              let output: FetchHolder = program.scope.output() as! FetchHolder
-              output.dim = newDim
-              output.capacity = newDim.numel()
-              output.paddedCapacity = newDim.numel() * 4
-              output.initBuffer(device: device)
+    override public func updateProgram(program: Program) {
+        // n h w c
+        for block in program.programDesc.blocks {
+            for varDesc in block.vars {
+                if !varDesc.persistable {
+                    if varDesc.type == .LodTensor {
+                        let varEle = program.scope.vars[varDesc.name]
+                        if let texture = varEle as? Texture {
+                            let newDim = Dim.init(inDim: [texture.dim[0],  inputDim[1], inputDim[2], texture.tensorDim[1]])
+                            print(" var desc name " + varDesc.name + " new dim" + "\(newDim)")
+                            
+                            texture.updateDims(inTensorDim: Dim.init(inDim: [texture.tensorDim[0], texture.tensorDim[1], inputDim[1], inputDim[2]]), inDim: newDim)
+                            texture.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: GlobalConfig.shared.computePrecision)
+                            
+                            let output: FetchHolder = program.scope.output() as! FetchHolder
+                            output.dim = newDim
+                            output.capacity = newDim.numel()
+                            output.paddedCapacity = newDim.numel() * 4
+                            output.initBuffer(device: device)
+                        }
+                    }
+                }
             }
-          }
         }
-      }
     }
-  }
 }
 
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift
index f9e841f9c2..0080aa80f6 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift
@@ -4,28 +4,28 @@ import Foundation
 import QuartzCore
 
 public class FPSCounter {
-  private(set) public var fps: Double = 0
-
-  var frames = 0
-  var startTime: CFTimeInterval = 0
-
-  public func start() {
-    frames = 0
-    startTime = CACurrentMediaTime()
-  }
-
-  public func frameCompleted() {
-    frames += 1
-    let now = CACurrentMediaTime()
-    let elapsed = now - startTime
-    if elapsed > 0.1 {
-      let current = Double(frames) / elapsed
-      let smoothing = 0.75
-      fps = smoothing*fps + (1 - smoothing)*current
-      if elapsed > 1 {
+    private(set) public var fps: Double = 0
+    
+    var frames = 0
+    var startTime: CFTimeInterval = 0
+    
+    public func start() {
         frames = 0
         startTime = CACurrentMediaTime()
-      }
     }
-  }
+    
+    public func frameCompleted() {
+        frames += 1
+        let now = CACurrentMediaTime()
+        let elapsed = now - startTime
+        if elapsed > 0.1 {
+            let current = Double(frames) / elapsed
+            let smoothing = 0.75
+            fps = smoothing*fps + (1 - smoothing)*current
+            if elapsed > 1 {
+                frames = 0
+                startTime = CACurrentMediaTime()
+            }
+        }
+    }
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift
index c235ed2f03..cb63954487 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift
@@ -6,15 +6,15 @@ import AVFoundation
 
 @available(iOS 10.0, *)
 @objc public protocol VideoCaptureDelegate: NSObjectProtocol {
-  @objc optional func videoCapture(_ capture: VideoCapture, didCaptureSampleBuffer sampleBuffer: CMSampleBuffer, timestamp: CMTime)
+    @objc optional func videoCapture(_ capture: VideoCapture, didCaptureSampleBuffer sampleBuffer: CMSampleBuffer, timestamp: CMTime)
     @objc optional func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime)
     @objc optional func videoCapture(_ capture: VideoCapture, didCapturePhoto previewImage: UIImage?)
     @objc optional func videoCapture(_ capture: VideoCapture, didCapturePhotoTexture texture: MTLTexture?)
 }
 
 /**
-  Simple interface to the iPhone's camera.
-*/
+ Simple interface to the iPhone's camera.
+ */
 @available(iOS 10.0, *)
 public class VideoCapture: NSObject {
     public var previewLayer: AVCaptureVideoPreviewLayer?
@@ -35,9 +35,9 @@ public class VideoCapture: NSObject {
         self.cameraPosition = position
         super.init()
     }
-
+    
     public func setUp(sessionPreset: AVCaptureSession.Preset = .medium,
-                    completion: @escaping (Bool) -> Void) {
+                      completion: @escaping (Bool) -> Void) {
         queue.async {
             let success = self.setUpCamera(sessionPreset: sessionPreset)
             DispatchQueue.main.async {
@@ -45,7 +45,7 @@ public class VideoCapture: NSObject {
             }
         }
     }
-
+    
     func fontCamera() -> AVCaptureDevice? {
         let deveices = AVCaptureDevice.DiscoverySession.init(deviceTypes: [.builtInWideAngleCamera], mediaType: AVMediaType.video, position: .front).devices
         return deveices.first
@@ -62,7 +62,7 @@ public class VideoCapture: NSObject {
         
         captureSession.beginConfiguration()
         captureSession.sessionPreset = sessionPreset
-
+        
         var oCaptureDevice: AVCaptureDevice?
         switch cameraPosition {
         case .back:
@@ -79,56 +79,56 @@ public class VideoCapture: NSObject {
             print("Error: no video devices available")
             return false
         }
-
+        
         guard let videoInput = try? AVCaptureDeviceInput(device: captureDevice) else {
             print("Error: could not create AVCaptureDeviceInput")
             return false
         }
-
+        
         if captureSession.canAddInput(videoInput) {
             captureSession.addInput(videoInput)
         }
-
+        
         let previewLayer = AVCaptureVideoPreviewLayer(session: captureSession)
         previewLayer.videoGravity = AVLayerVideoGravity.resizeAspect
         previewLayer.connection?.videoOrientation = self.videoOrientation
         self.previewLayer = previewLayer
-
+        
         let settings: [String : Any] = [
-        kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA)
+            kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA)
         ]
-
+        
         videoOutput.videoSettings = settings
         videoOutput.alwaysDiscardsLateVideoFrames = true
         videoOutput.setSampleBufferDelegate(self, queue: queue)
         if captureSession.canAddOutput(videoOutput) {
             captureSession.addOutput(videoOutput)
         }
-
+        
         // We want the buffers to be in portrait orientation otherwise they are
         // rotated by 90 degrees. Need to set this _after_ addOutput()!
         videoOutput.connection(with: AVMediaType.video)?.videoOrientation = self.videoOrientation
-
+        
         if captureSession.canAddOutput(photoOutput) {
             captureSession.addOutput(photoOutput)
         }
-
+        
         captureSession.commitConfiguration()
         return true
     }
-
+    
     public func start() {
         if !captureSession.isRunning {
             captureSession.startRunning()
         }
     }
-
+    
     public func stop() {
         if captureSession.isRunning {
             captureSession.stopRunning()
         }
     }
-
+    
     /* Captures a single frame of the camera input. */
     public func capturePhoto() {
         let settings = AVCapturePhotoSettings(format: [kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA)])
@@ -139,7 +139,7 @@ public class VideoCapture: NSObject {
         ]
         photoOutput.capturePhoto(with: settings, delegate: self)
     }
-
+    
     func convertToMTLTexture(sampleBuffer: CMSampleBuffer?) -> MTLTexture? {
         if let textureCache = textureCache, let sampleBuffer = sampleBuffer, let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) {
             let width = CVPixelBufferGetWidth(imageBuffer)
@@ -152,7 +152,7 @@ public class VideoCapture: NSObject {
         }
         return nil
     }
-
+    
     func convertToUIImage(sampleBuffer: CMSampleBuffer?) -> UIImage? {
         if let sampleBuffer = sampleBuffer,
             let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) {
@@ -172,47 +172,47 @@ public class VideoCapture: NSObject {
 
 @available(iOS 10.0, *)
 extension VideoCapture: AVCaptureVideoDataOutputSampleBufferDelegate {
-  public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
-    // Because lowering the capture device's FPS looks ugly in the preview,
-    // we capture at full speed but only call the delegate at its desired
-    // framerate. If `fps` is -1, we run at the full framerate.
-    let timestamp = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
-    let deltaTime = timestamp - lastTimestamp
-    if fps == -1 || deltaTime >= CMTimeMake(1, Int32(fps)) {
-        lastTimestamp = timestamp
-        self.delegate?.videoCapture?(self, didCaptureSampleBuffer: sampleBuffer, timestamp: timestamp)
-        if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCaptureVideoTexture:timestamp:))) ?? false{
-            let texture = convertToMTLTexture(sampleBuffer: sampleBuffer)
-            delegate?.videoCapture?(self, didCaptureVideoTexture: texture, timestamp: timestamp)
+    public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
+        // Because lowering the capture device's FPS looks ugly in the preview,
+        // we capture at full speed but only call the delegate at its desired
+        // framerate. If `fps` is -1, we run at the full framerate.
+        let timestamp = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
+        let deltaTime = timestamp - lastTimestamp
+        if fps == -1 || deltaTime >= CMTimeMake(1, Int32(fps)) {
+            lastTimestamp = timestamp
+            self.delegate?.videoCapture?(self, didCaptureSampleBuffer: sampleBuffer, timestamp: timestamp)
+            if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCaptureVideoTexture:timestamp:))) ?? false{
+                let texture = convertToMTLTexture(sampleBuffer: sampleBuffer)
+                delegate?.videoCapture?(self, didCaptureVideoTexture: texture, timestamp: timestamp)
+            }
         }
     }
-  }
-
-  public func captureOutput(_ output: AVCaptureOutput, didDrop sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
-    print("dropped frame")
-  }
+    
+    public func captureOutput(_ output: AVCaptureOutput, didDrop sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
+        print("dropped frame")
+    }
 }
 
 @available(iOS 10.0, *)
 extension VideoCapture: AVCapturePhotoCaptureDelegate {
-  public func photoOutput(_ captureOutput: AVCapturePhotoOutput,
-                          didFinishProcessingPhoto photoSampleBuffer: CMSampleBuffer?,
-                          previewPhoto previewPhotoSampleBuffer: CMSampleBuffer?,
-                          resolvedSettings: AVCaptureResolvedPhotoSettings,
-                          bracketSettings: AVCaptureBracketedStillImageSettings?,
-                          error: Error?) {
-    var imageTexture: MTLTexture?
-    var previewImage: UIImage?
-    if error == nil {
-        if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhotoTexture:))) ?? false{
-            imageTexture = convertToMTLTexture(sampleBuffer: photoSampleBuffer)
-            self.delegate?.videoCapture?(self, didCapturePhotoTexture: imageTexture)
-        }
-        
-        if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhoto:))) ?? false{
-            previewImage = convertToUIImage(sampleBuffer: previewPhotoSampleBuffer)
-            self.delegate?.videoCapture?(self, didCapturePhoto: previewImage)
+    public func photoOutput(_ captureOutput: AVCapturePhotoOutput,
+                            didFinishProcessingPhoto photoSampleBuffer: CMSampleBuffer?,
+                            previewPhoto previewPhotoSampleBuffer: CMSampleBuffer?,
+                            resolvedSettings: AVCaptureResolvedPhotoSettings,
+                            bracketSettings: AVCaptureBracketedStillImageSettings?,
+                            error: Error?) {
+        var imageTexture: MTLTexture?
+        var previewImage: UIImage?
+        if error == nil {
+            if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhotoTexture:))) ?? false{
+                imageTexture = convertToMTLTexture(sampleBuffer: photoSampleBuffer)
+                self.delegate?.videoCapture?(self, didCapturePhotoTexture: imageTexture)
+            }
+            
+            if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhoto:))) ?? false{
+                previewImage = convertToUIImage(sampleBuffer: previewPhotoSampleBuffer)
+                self.delegate?.videoCapture?(self, didCapturePhoto: previewImage)
+            }
         }
     }
-  }
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
index 612a986d85..42d6c2b7ab 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
@@ -19,265 +19,242 @@ import paddle_mobile
 import MetalPerformanceShaders
 
 class FileReader {
-  let file: UnsafeMutablePointer<FILE>
-  let fileSize: Int
-  init(paramPath: String) throws {
-    guard let tmpFile = fopen(paramPath, "rb") else {
-      throw PaddleMobileError.loaderError(message: "open param file error" + paramPath)
+    let file: UnsafeMutablePointer<FILE>
+    let fileSize: Int
+    init(paramPath: String) throws {
+        guard let tmpFile = fopen(paramPath, "rb") else {
+            throw PaddleMobileError.loaderError(message: "open param file error" + paramPath)
+        }
+        file = tmpFile
+        fseek(file, 0, SEEK_END)
+        fileSize = ftell(file)
+        guard fileSize > 0 else {
+            throw PaddleMobileError.loaderError(message: "param file size is too small")
+        }
+        rewind(file)
+    }
+    
+    func read<T>() -> UnsafeMutablePointer<T> { // returns a freshly allocated buffer that this method never deallocates
+        let ptr = UnsafeMutablePointer<T>.allocate(capacity: (fileSize + MemoryLayout<T>.stride - 1) / MemoryLayout<T>.stride) // capacity is counted in T elements, not bytes: round the byte size up to whole elements
+        fread(ptr, fileSize, 1, file) // reads `fileSize` bytes as a single item from the current file position
+        return ptr
     }
-    file = tmpFile
-    fseek(file, 0, SEEK_END)
-    fileSize = ftell(file)
-    guard fileSize > 0 else {
-      throw PaddleMobileError.loaderError(message: "param file size is too small")
+    
+    deinit {
+        fclose(file)
     }
-    rewind(file)
-  }
-  
-  func read<T>() -> UnsafeMutablePointer<T> {
-    let ptr = UnsafeMutablePointer<T>.allocate(capacity: MemoryLayout<T>.size * fileSize)
-    fread(ptr, fileSize, 1, file)
-    return ptr
-  }
-  
-  deinit {
-    fclose(file)
-  }
 }
 
 enum Platform {
-  case GPU
+    case GPU
 }
 
 let platformSupport: [(Platform, String)] = [(.GPU, "GPU")]
 
 enum SupportModel: String{
-  case yolo               = "yolo"
-  case mobilenet_combined = "mobilenet_combined"
-  case super_resolution   = "superresoltion"
-  case mobilenet          = "mobilenet"
-  
-  static func supportedModels() -> [SupportModel] {
-    return [.super_resolution, .yolo, .mobilenet_combined, .mobilenet]
-  }
+    case yolo               = "yolo"
+    case mobilenet_combined = "mobilenet_combined"
+    case super_resolution   = "superresoltion"
+    case mobilenet          = "mobilenet"
+    
+    static func supportedModels() -> [SupportModel] {
+        return [.super_resolution, .yolo, .mobilenet_combined, .mobilenet]
+    }
 }
 
 let netSupport: [SupportModel : Net] = [
-  .super_resolution : SuperResolutionNet.init(device: MetalHelper.shared.device),
-  .yolo : YoloNet.init(device: MetalHelper.shared.device),
-  .mobilenet_combined : MobileNetCombined.init(device: MetalHelper.shared.device),
-  .mobilenet : MobileNet.init(device: MetalHelper.shared.device)]
+    .super_resolution : SuperResolutionNet.init(device: MetalHelper.shared.device),
+    .yolo : YoloNet.init(device: MetalHelper.shared.device),
+    .mobilenet_combined : MobileNetCombined.init(device: MetalHelper.shared.device),
+    .mobilenet : MobileNet.init(device: MetalHelper.shared.device)]
 
 class ViewController: UIViewController {
-  @IBOutlet weak var resultTextView: UITextView!
-  @IBOutlet weak var selectImageView: UIImageView!
-  @IBOutlet weak var elapsedTimeLabel: UILabel!
-  @IBOutlet weak var modelPickerView: UIPickerView!
-  @IBOutlet weak var threadPickerView: UIPickerView!
-  @IBOutlet weak var videoView: UIView!
-  //  var videoCapture: VideoCapture!
-  
-  var selectImage: UIImage?
-  var inputPointer: UnsafeMutablePointer<Float32>?
-  var modelType: SupportModel = SupportModel.supportedModels()[0]
-  var toPredictTexture: MTLTexture?
-  
-  var runner: Runner!
-  var platform: Platform = .GPU
-  var threadNum = 1
-  
-  @IBAction func loadAct(_ sender: Any) {
-    runner = Runner.init(inNet: netSupport[modelType]!, commandQueue: MetalHelper.shared.queue)
-    if platform == .GPU {
-//      let filePath = Bundle.main.path(forResource: "mingren_input_data", ofType: nil)
-//      let fileReader = try! FileReader.init(paramPath: filePath!)
-//      let pointer: UnsafeMutablePointer<Float32> = fileReader.read()
-//      
-//      
-//      let buffer = MetalHelper.shared.device.makeBuffer(length: fileReader.fileSize, options: .storageModeShared)
-//      
-//      buffer?.contents().copyMemory(from: pointer, byteCount: fileReader.fileSize)
-      
-      
-      if self.toPredictTexture == nil {
-        
-//        runner.getTexture(inBuffer: buffer!) { [weak self] (texture) in
-//          self?.toPredictTexture = texture
-//        }
+    @IBOutlet weak var resultTextView: UITextView!
+    @IBOutlet weak var selectImageView: UIImageView!
+    @IBOutlet weak var elapsedTimeLabel: UILabel!
+    @IBOutlet weak var modelPickerView: UIPickerView!
+    @IBOutlet weak var threadPickerView: UIPickerView!
+    @IBOutlet weak var videoView: UIView!
+    //  var videoCapture: VideoCapture!
+    
+    var selectImage: UIImage?
+    var inputPointer: UnsafeMutablePointer<Float32>?
+    var modelType: SupportModel = SupportModel.supportedModels()[0]
+    var toPredictTexture: MTLTexture?
+    
+    var runner: Runner!
+    var platform: Platform = .GPU
+    var threadNum = 1
+    
+    @IBAction func loadAct(_ sender: Any) {
+        runner = Runner.init(inNet: netSupport[modelType]!, commandQueue: MetalHelper.shared.queue)
+        if platform == .GPU {
+            //      let filePath = Bundle.main.path(forResource: "mingren_input_data", ofType: nil)
+            //      let fileReader = try! FileReader.init(paramPath: filePath!)
+            //      let pointer: UnsafeMutablePointer<Float32> = fileReader.read()
+            //      
+            //      
+            //      let buffer = MetalHelper.shared.device.makeBuffer(length: fileReader.fileSize, options: .storageModeShared)
+            //      
+            //      buffer?.contents().copyMemory(from: pointer, byteCount: fileReader.fileSize)
+            
+            
+            if self.toPredictTexture == nil, let cgImage = selectImage?.cgImage {
+                // Build the input texture only when an image with a backing CGImage is available, instead of force-unwrapping.
+                //        runner.getTexture(inBuffer: buffer!) { [weak self] (texture) in
+                //          self?.toPredictTexture = texture
+                //        }
+                
+                runner.getTexture(image: cgImage) { [weak self] (texture) in
+                    self?.toPredictTexture = texture
+                }
+            }
+        } else {
+            fatalError( " unsupport " )
+        }
         
-        runner.getTexture(image: selectImage!.cgImage!) { [weak self] (texture) in
-          self?.toPredictTexture = texture
+        if runner.load() {
+            print(" load success ! ")
+        } else {
+            print(" load error ! ")
         }
-      }
-    } else {
-      fatalError( " unsupport " )
     }
     
-    if runner.load() {
-      print(" load success ! ")
-    } else {
-      print(" load error ! ")
+    @IBAction func selectImageAct(_ sender: Any) {
+        let imagePicker = UIImagePickerController()
+        imagePicker.sourceType = .camera
+        imagePicker.delegate = self
+        self.present(imagePicker, animated: true, completion: nil)
     }
-  }
-  
-  @IBAction func selectImageAct(_ sender: Any) {
-    let imagePicker = UIImagePickerController()
-    imagePicker.sourceType = .camera
-    imagePicker.delegate = self
-    self.present(imagePicker, animated: true, completion: nil)
-  }
-  
-  @IBAction func clearAct(_ sender: Any) {
-    runner.clear()
-  }
-  
-  @IBAction func predictAct(_ sender: Any) {
-    let max = 1
-    switch platform {
-    case .GPU:
-      guard let inTexture = toPredictTexture else {
-        resultTextView.text = "请选择图片 ! "
-        return
-      }
-      
-      let startDate = Date.init()
-      for i in 0..<max {
-        self.runner.predict(texture: inTexture) { [weak self] (success, resultHolder)  in
-          guard let sSelf = self else {
-            fatalError()
-          }
-          
-          if success, let inResultHolderArr = resultHolder {
-            let inResultHolder = inResultHolderArr[0]
-            if i == max - 1 {
-              let time = Date.init().timeIntervalSince(startDate)
+    
+    @IBAction func clearAct(_ sender: Any) {
+        runner.clear()
+    }
+    
+    @IBAction func predictAct(_ sender: Any) {
+        let max = 1
+        switch platform {
+        case .GPU:
+            guard let inTexture = toPredictTexture else {
+                resultTextView.text = "请选择图片 ! "
+                return
+            }
             
-              print(inResultHolder.result.floatArr(count: inResultHolder.capacity).strideArray())
-              DispatchQueue.main.async {
-                sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: resultHolder!)
-                sSelf.elapsedTimeLabel.text = "平均耗时: \(time/Double(max) * 1000.0) ms"
-              }
+            let startDate = Date.init()
+            for i in 0..<max {
+                self.runner.predict(texture: inTexture) { [weak self] (success, resultHolder)  in
+                    // The controller is captured weakly and may be gone by the time the
+                    // prediction completes; then only the UI update is skipped and the
+                    // release further down still runs. Empty results are skipped as well.
+                    
+                    if let sSelf = self, success, let inResultHolderArr = resultHolder, !inResultHolderArr.isEmpty {
+                        let inResultHolder = inResultHolderArr[0]
+                        if i == max - 1 {
+                            let time = Date.init().timeIntervalSince(startDate)
+                            
+                            print(inResultHolder.result.floatArr(count: inResultHolder.capacity).strideArray())
+                            DispatchQueue.main.async {
+                                sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: inResultHolderArr)
+                                sSelf.elapsedTimeLabel.text = "平均耗时: \(time/Double(max) * 1000.0) ms"
+                            }
+                        }
+                    }
+                    // Queued on the main queue after the UI block above, so the result is read before it is released.
+                    DispatchQueue.main.async {
+                        resultHolder?.first?.releasePointer()
+                    }
+                }
             }
-          }
-          
-          DispatchQueue.main.async {
-            resultHolder?.first?.releasePointer()
-          }
         }
-      }
     }
-  }
-  
-  override func viewDidLoad() {
-    super.viewDidLoad()
     
-    modelPickerView.delegate = self
-    modelPickerView.dataSource = self
-    threadPickerView.delegate = self
-    threadPickerView.dataSource = self
-    if let image = UIImage.init(named: "classify-img-output.png") {
-      selectImage = image
-      selectImageView.image = image
-    } else {
-      print("请添加测试图片")
+    override func viewDidLoad() {
+        super.viewDidLoad()
+        
+        GlobalConfig.shared.computePrecision = .Float16
+        GlobalConfig.shared.debug = false
+        
+        modelPickerView.delegate = self
+        modelPickerView.dataSource = self
+        threadPickerView.delegate = self
+        threadPickerView.dataSource = self
+        if let image = UIImage.init(named: "00001.jpg") {
+            selectImage = image
+            selectImageView.image = image
+        } else {
+            print("请添加测试图片")
+        }
     }
-    
-    GlobalConfig.shared.computePrecision = .Float32
-    
-    //    if platform == .CPU {
-    //      inputPointer = runner.preproccess(image: selectImage!.cgImage!)
-    //    } else if platform == .GPU {
-    //      runner.getTexture(image: selectImage!.cgImage!) {[weak self] (texture) in
-    //        self?.toPredictTexture = texture
-    //      }
-    //    } else {
-    //      fatalError( " unsupport " )
-    //    }
-    
-    //    videoCapture = VideoCapture.init(device: MetalHelper.shared.device, orientation: .portrait, position: .back)
-    //    videoCapture.fps = 30
-    //    videoCapture.delegate = self
-    //    videoCapture.setUp { (success) in
-    //      DispatchQueue.main.async {
-    //        if let preViewLayer = self.videoCapture.previewLayer {
-    //          self.videoView.layer.addSublayer(preViewLayer)
-    //          self.videoCapture.previewLayer?.frame = self.videoView.bounds
-    //        }
-    //        self.videoCapture.start()
-    //      }
-    //    }
-    
-  }
 }
 
 extension ViewController: UIPickerViewDataSource, UIPickerViewDelegate{
-  func numberOfComponents(in pickerView: UIPickerView) -> Int {
-    if pickerView == modelPickerView {
-      return 1
-    } else if pickerView == threadPickerView {
-      return 1
-    } else {
-      fatalError()
+    func numberOfComponents(in pickerView: UIPickerView) -> Int {
+        if pickerView == modelPickerView {
+            return 1
+        } else if pickerView == threadPickerView {
+            return 1
+        } else {
+            fatalError()
+        }
     }
-  }
-  
-  func pickerView(_ pickerView: UIPickerView, numberOfRowsInComponent component: Int) -> Int {
-    if pickerView == modelPickerView {
-      return SupportModel.supportedModels().count
-    } else if pickerView == threadPickerView {
-      return platformSupport.count
-    } else {
-      fatalError()
+    
+    func pickerView(_ pickerView: UIPickerView, numberOfRowsInComponent component: Int) -> Int {
+        if pickerView == modelPickerView {
+            return SupportModel.supportedModels().count
+        } else if pickerView == threadPickerView {
+            return platformSupport.count
+        } else {
+            fatalError()
+        }
     }
-  }
-  
-  public func pickerView(_ pickerView: UIPickerView, titleForRow row: Int, forComponent component: Int) -> String? {
-    if pickerView == modelPickerView {
-      return SupportModel.supportedModels()[row].rawValue
-    } else if pickerView == threadPickerView {
-      return platformSupport[row].1
-    } else {
-      fatalError()
+    
+    public func pickerView(_ pickerView: UIPickerView, titleForRow row: Int, forComponent component: Int) -> String? {
+        if pickerView == modelPickerView {
+            return SupportModel.supportedModels()[row].rawValue
+        } else if pickerView == threadPickerView {
+            return platformSupport[row].1
+        } else {
+            fatalError()
+        }
     }
-  }
-  
-  public func pickerView(_ pickerView: UIPickerView, didSelectRow row: Int, inComponent component: Int) {
-    if pickerView == modelPickerView {
-      self.modelType = SupportModel.supportedModels()[row]
-    } else if pickerView == threadPickerView {
-      platform = platformSupport[row].0
-    } else {
-      fatalError()
+    
+    public func pickerView(_ pickerView: UIPickerView, didSelectRow row: Int, inComponent component: Int) {
+        if pickerView == modelPickerView {
+            self.modelType = SupportModel.supportedModels()[row]
+        } else if pickerView == threadPickerView {
+            platform = platformSupport[row].0
+        } else {
+            fatalError()
+        }
     }
-  }
 }
 
 extension ViewController:  UIImagePickerControllerDelegate, UINavigationControllerDelegate {
-  func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) {
-    picker.dismiss(animated: true){[weak self] in
-      guard let sSelf = self, let image =  info["UIImagePickerControllerOriginalImage"] as? UIImage else{
-        fatalError("no image")
-      }
-      sSelf.selectImage = image
-      sSelf.selectImageView.image = image
-      sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in
-        sSelf.toPredictTexture = texture
-      })
+    func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) {
+        picker.dismiss(animated: true){[weak self] in
+            guard let sSelf = self, let image =  info["UIImagePickerControllerOriginalImage"] as? UIImage else{
+                fatalError("no image")
+            }
+            sSelf.selectImage = image
+            sSelf.selectImageView.image = image
+            sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in
+                sSelf.toPredictTexture = texture
+            })
+        }
     }
-  }
 }
 
 var bool1 = false
 extension ViewController: VideoCaptureDelegate{
-  func predictTexture(texture: MTLTexture){
-    runner.scaleTexture(input: texture) { (scaledTexture) in
-      self.runner.predict(texture: scaledTexture, completion: { (success, resultHolder) in
-        //        print(resultHolder!.result![0])
-        resultHolder?.first?.releasePointer()
-      })
+    func predictTexture(texture: MTLTexture){
+        runner.scaleTexture(input: texture) { (scaledTexture) in
+            self.runner.predict(texture: scaledTexture, completion: { (success, resultHolder) in
+                //        print(resultHolder!.result![0])
+                resultHolder?.first?.releasePointer()
+            })
+        }
     }
-  }
-
+    
 }
 
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj b/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj
index 5b7b65da7c..007fd5e429 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj
@@ -326,9 +326,10 @@
 			isa = XCBuildConfiguration;
 			buildSettings = {
 				ALWAYS_SEARCH_USER_PATHS = NO;
-				IPHONEOS_DEPLOYMENT_TARGET = 12.1;
+				IPHONEOS_DEPLOYMENT_TARGET = 9.0;
 				MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
 				MTL_FAST_MATH = YES;
+				MTL_LANGUAGE_REVISION = Metal12;
 				SDKROOT = iphoneos;
 			};
 			name = Debug;
@@ -337,9 +338,10 @@
 			isa = XCBuildConfiguration;
 			buildSettings = {
 				ALWAYS_SEARCH_USER_PATHS = NO;
-				IPHONEOS_DEPLOYMENT_TARGET = 12.1;
+				IPHONEOS_DEPLOYMENT_TARGET = 9.0;
 				MTL_ENABLE_DEBUG_INFO = NO;
 				MTL_FAST_MATH = YES;
+				MTL_LANGUAGE_REVISION = Metal12;
 				SDKROOT = iphoneos;
 			};
 			name = Release;
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormKernel.metal
index 96333a07a9..ab1dcfae68 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormKernel.metal
@@ -20,23 +20,23 @@ kernel void batchnorm(texture2d_array<float, access::read> inTexture [[texture(0
                       const device float4 * nscale [[buffer(0)]],
                       const device float4 * nbias [[buffer(1)]],
                       uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  const float4 input = inTexture.read(gid.xy, gid.z);
-  float4 output = input * nscale[gid.z] + nbias[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    const float4 input = inTexture.read(gid.xy, gid.z);
+    float4 output = input * nscale[gid.z] + nbias[gid.z];
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void batchnorm_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
-                      texture2d_array<half, access::write> outTexture [[texture(1)]],
-                      const device half4 * newScale [[buffer(0)]],
-                      const device half4 * newBias [[buffer(1)]],
-                      uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  const half4 input = inTexture.read(gid.xy, gid.z);
-  half4 output = input * newScale[gid.z] + newBias[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
+                           texture2d_array<half, access::write> outTexture [[texture(1)]],
+                           const device half4 * newScale [[buffer(0)]],
+                           const device half4 * newBias [[buffer(1)]],
+                           uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    const half4 input = inTexture.read(gid.xy, gid.z);
+    half4 output = input * newScale[gid.z] + newBias[gid.z];
+    outTexture.write(output, gid.xy, gid.z);
 }
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal
index eb94408c8a..98ba10d847 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal
@@ -15,10 +15,10 @@ struct MetalConvParam {
 };
 
 kernel void batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
-                                         const device float4 *new_scale [[buffer(0)]],
-                                         const device float4 *new_biase [[buffer(1)]],
-                                         uint3 gid [[thread_position_in_grid]]) {
+                                texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                const device float4 *new_scale [[buffer(0)]],
+                                const device float4 *new_biase [[buffer(1)]],
+                                uint3 gid [[thread_position_in_grid]]) {
     
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height() ||
@@ -32,5 +32,5 @@ kernel void batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture
     input = inTexture.sample(sample, gid.x, gid.y, gid.z);
     output = fmax(input * new_scale[gid.z] + new_biase[gid.z], 0.0);
     outTexture.write(output, gid.xy, gid.z);
-
+    
 }
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.inc.metal
index a590f80898..188c31019d 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.inc.metal
@@ -21,29 +21,29 @@
 #define VECTOR(p, n) CONCAT2(p, n)
 
 kernel void FUNC(bilinear_interp, P)(texture2d_array<P, access::read> input [[texture(0)]],
-                     texture2d_array<P, access::write> output [[texture(1)]],
-                     constant bilinear_interp_param & pm [[buffer(0)]],
-                     uint3 gid [[thread_position_in_grid]]) {
-  VECTOR(P, 4) r;
-  if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
-    r = input.read(gid.xy, gid.z);
-  } else {
-    P w = gid.x * pm.ratio_w;
-    P h = gid.y * pm.ratio_h;
-    uint w0 = w, h0 = h;
-    uint w1 = w0 + 1, h1 = h0 + 1;
-    P w1lambda = w - w0, h1lambda = h - h0;
-    P w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
-    if (w1 >= input.get_width()) w1 = w0;
-    if (h1 >= input.get_height()) h1 = h0;
-    VECTOR(P, 4) r0 = input.read(uint2(w0, h0), gid.z);
-    VECTOR(P, 4) r1 = input.read(uint2(w1, h0), gid.z);
-    VECTOR(P, 4) r2 = input.read(uint2(w0, h1), gid.z);
-    VECTOR(P, 4) r3 = input.read(uint2(w1, h1), gid.z);
-    r = h2lambda * (w2lambda * r0 + w1lambda * r1)
-      + h1lambda * (w2lambda * r2 + w1lambda * r3);
-  }
-  output.write(r, gid.xy, gid.z);
+                                     texture2d_array<P, access::write> output [[texture(1)]],
+                                     constant bilinear_interp_param & pm [[buffer(0)]],
+                                     uint3 gid [[thread_position_in_grid]]) {
+    VECTOR(P, 4) r;
+    if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
+        r = input.read(gid.xy, gid.z); // same width and height: plain copy
+    } else {
+        P w = gid.x * pm.ratio_w; // output position scaled by the ratios in pm
+        P h = gid.y * pm.ratio_h;
+        uint w0 = w, h0 = h; // conversion to uint drops the fractional part
+        uint w1 = w0 + 1, h1 = h0 + 1;
+        P w1lambda = w - w0, h1lambda = h - h0; // fractional parts used as blend weights
+        P w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
+        if (w1 >= input.get_width()) w1 = w0; // past the last column: reuse w0
+        if (h1 >= input.get_height()) h1 = h0; // past the last row: reuse h0
+        VECTOR(P, 4) r0 = input.read(uint2(w0, h0), gid.z);
+        VECTOR(P, 4) r1 = input.read(uint2(w1, h0), gid.z);
+        VECTOR(P, 4) r2 = input.read(uint2(w0, h1), gid.z);
+        VECTOR(P, 4) r3 = input.read(uint2(w1, h1), gid.z);
+        r = h2lambda * (w2lambda * r0 + w1lambda * r1)
+          + h1lambda * (w2lambda * r2 + w1lambda * r3);
+    }
+    output.write(r, gid.xy, gid.z);
 }
 
 #endif
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.metal
index 394cf89db0..6104abb01d 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.metal
@@ -16,8 +16,8 @@
 using namespace metal;
 
 struct bilinear_interp_param {
-  float ratio_h;
-  float ratio_w;
+    float ratio_h;
+    float ratio_w;
 };
 
 #define P float
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.inc.metal
index 918fbac1a7..184ee2bb71 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.inc.metal
@@ -20,35 +20,35 @@
 #define FUNC(f, p) CONCAT2_(f, p)
 #define VECTOR(p, n) CONCAT2(p, n)
 kernel void FUNC(boxcoder, P)(texture2d_array<P, access::read> priorBox [[texture(0)]],
-                     texture2d_array<P, access::read> priorBoxVar [[texture(1)]],
-                     texture2d_array<P, access::read> targetBox [[texture(2)]],
-                     texture2d_array<P, access::write> output[[texture(3)]],
-                     uint3 gid [[thread_position_in_grid]]) {
-  VECTOR(P, 4) p = priorBox.read(uint2(0, gid.x), gid.z);
-  VECTOR(P, 4) pv = priorBoxVar.read(uint2(0, gid.x), gid.z);
-  VECTOR(P, 4) t;
-  t[0] = targetBox.read(uint2(0, gid.x), gid.z)[0];
-  t[1] = targetBox.read(uint2(1, gid.x), gid.z)[0];
-  t[2] = targetBox.read(uint2(2, gid.x), gid.z)[0];
-  t[3] = targetBox.read(uint2(3, gid.x), gid.z)[0];
-  
-  P px = (p.x + p.z) / 2;
-  P py = (p.y + p.w) / 2;
-  P pw = p.z - p.x;
-  P ph = p.w - p.y;
-  
-  P tx = pv.x * t.x * pw + px;
-  P ty = pv.y * t.y * ph + py;
-  P tw = exp(pv.z * t.z) * pw;
-  P th = exp(pv.w * t.w) * ph;
-  
-  VECTOR(P, 4) r;
-  r.x = tx - tw / 2;
-  r.y = ty - th / 2;
-  r.z = tx + tw / 2;
-  r.w = ty + th / 2;
-
-  output.write(r, gid.xy, gid.z);
+                              texture2d_array<P, access::read> priorBoxVar [[texture(1)]],
+                              texture2d_array<P, access::read> targetBox [[texture(2)]],
+                              texture2d_array<P, access::write> output[[texture(3)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+    VECTOR(P, 4) p = priorBox.read(uint2(0, gid.x), gid.z);
+    VECTOR(P, 4) pv = priorBoxVar.read(uint2(0, gid.x), gid.z);
+    VECTOR(P, 4) t;
+    t[0] = targetBox.read(uint2(0, gid.x), gid.z)[0];
+    t[1] = targetBox.read(uint2(1, gid.x), gid.z)[0];
+    t[2] = targetBox.read(uint2(2, gid.x), gid.z)[0];
+    t[3] = targetBox.read(uint2(3, gid.x), gid.z)[0];
+    
+    P px = (p.x + p.z) / 2;
+    P py = (p.y + p.w) / 2;
+    P pw = p.z - p.x;
+    P ph = p.w - p.y;
+    
+    P tx = pv.x * t.x * pw + px;
+    P ty = pv.y * t.y * ph + py;
+    P tw = exp(pv.z * t.z) * pw;
+    P th = exp(pv.w * t.w) * ph;
+    
+    VECTOR(P, 4) r;
+    r.x = tx - tw / 2;
+    r.y = ty - th / 2;
+    r.z = tx + tw / 2;
+    r.w = ty + th / 2;
+    
+    output.write(r, gid.xy, gid.z);
 }
 
 #endif
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BufferToTexture.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BufferToTexture.metal
index 3c07872616..12450f5741 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BufferToTexture.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BufferToTexture.metal
@@ -13,24 +13,24 @@ kernel void buffer_to_texture_kernel(
                                      const device float *input [[buffer(0)]],
                                      texture2d<float, access::write> outTexture [[texture(0)]],
                                      uint2 gid [[thread_position_in_grid]]){
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) {
-    return;
-  }
-  
-  float y = input[outTexture.get_width() * gid.y + gid.x];
-  outTexture.write(float4(y, 0.0f, 0.0f, 0.0f), gid);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    
+    float y = input[outTexture.get_width() * gid.y + gid.x];
+    outTexture.write(float4(y, 0.0f, 0.0f, 0.0f), gid);
 }
 
 kernel void buffer_to_texture_kernel_half(const device float *input [[buffer(0)]],
                                           texture2d<half, access::write> outTexture [[texture(0)]],
                                           uint2 gid [[thread_position_in_grid]]){
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) {
-    return;
-  }
-  
-  float y = input[outTexture.get_width() * gid.y + gid.x];
-  outTexture.write(half4(y, 0.0f, 0.0f, 0.0f), gid);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    
+    float y = input[outTexture.get_width() * gid.y + gid.x];
+    outTexture.write(half4(y, 0.0f, 0.0f, 0.0f), gid);
 }
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
index 40bae035c0..099b8ca77c 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
@@ -17,104 +17,104 @@ using namespace metal;
 
 
 inline void xyzn2abcd_1(int xyzn[4], int abcd[4]) {
-  abcd[0] = abcd[1] = abcd[2] = 0;
-  abcd[3] = xyzn[0] * 4 + xyzn[3];
+    abcd[0] = abcd[1] = abcd[2] = 0;
+    abcd[3] = xyzn[0] * 4 + xyzn[3];
 }
 inline void xyzn2abcd_2(int xyzn[4], int abcd[4]) {
-  abcd[0] = abcd[1] = 0;
-  abcd[2] = xyzn[1];
-  abcd[3] = xyzn[0] * 4 + xyzn[3];
+    abcd[0] = abcd[1] = 0;
+    abcd[2] = xyzn[1];
+    abcd[3] = xyzn[0] * 4 + xyzn[3];
 }
 inline void xyzn2abcd_3(int xyzn[4], int abcd[4]) {
-  abcd[0] = 0;
-  abcd[3] = xyzn[0];
-  abcd[2] = xyzn[1];
-  abcd[1] = xyzn[2] * 4 + xyzn[3];
+    abcd[0] = 0;
+    abcd[3] = xyzn[0];
+    abcd[2] = xyzn[1];
+    abcd[1] = xyzn[2] * 4 + xyzn[3];
 }
 inline void xyzn2abcd_4(int C, int xyzn[4], int abcd[4]) {
-  abcd[2] = xyzn[0];
-  abcd[1] = xyzn[1];
-  uint t = xyzn[2] * 4 + xyzn[3];
-  abcd[0] = t / C;
-  abcd[3] = t % C;
+    abcd[2] = xyzn[0];
+    abcd[1] = xyzn[1];
+    uint t = xyzn[2] * 4 + xyzn[3];
+    abcd[0] = t / C;
+    abcd[3] = t % C;
 }
 
 inline void abcd2xyzn_1(int abcd[4], int xyzn[4]) {
-  xyzn[1] = xyzn[2] = 0;
-  xyzn[0] = abcd[3] / 4;
-  xyzn[1] = abcd[3] % 4;
+    xyzn[1] = xyzn[2] = 0; // inverse of xyzn2abcd_1, which computes abcd[3] = xyzn[0] * 4 + xyzn[3]
+    xyzn[0] = abcd[3] / 4;
+    xyzn[3] = abcd[3] % 4; // was stored to xyzn[1], clobbering the zero above and leaving xyzn[3] unset
 }
 inline void abcd2xyzn_2(int abcd[4], int xyzn[4]) {
-  xyzn[2] = 0;
-  xyzn[1] = abcd[2];
-  xyzn[0] = abcd[3] / 4;
-  xyzn[3] = abcd[3] % 4;
+    xyzn[2] = 0;
+    xyzn[1] = abcd[2];
+    xyzn[0] = abcd[3] / 4;
+    xyzn[3] = abcd[3] % 4;
 }
 inline void abcd2xyzn_3(int abcd[4], int xyzn[4]) {
-  xyzn[0] = abcd[3];
-  xyzn[1] = abcd[2];
-  xyzn[2] = abcd[1] / 4;
-  xyzn[3] = abcd[1] % 4;
+    xyzn[0] = abcd[3];
+    xyzn[1] = abcd[2];
+    xyzn[2] = abcd[1] / 4;
+    xyzn[3] = abcd[1] % 4;
 }
 inline void abcd2xyzn_4(int C, int abcd[4], int xyzn[4]) {
-  xyzn[0] = abcd[2];
-  xyzn[1] = abcd[1];
-  uint t = abcd[0] * C + abcd[3];
-  xyzn[2] = t / 4;
-  xyzn[3] = t % 4;
+    xyzn[0] = abcd[2];
+    xyzn[1] = abcd[1];
+    uint t = abcd[0] * C + abcd[3];
+    xyzn[2] = t / 4;
+    xyzn[3] = t % 4;
 }
 
 inline void xyzn2abcd(int C, int xyzn[4], int abcd[4]) {
-  abcd[2] = xyzn[0];
-  abcd[1] = xyzn[1];
-  uint t = xyzn[2] * 4 + xyzn[3];
-  abcd[0] = t / C;
-  abcd[3] = t % C;
+    abcd[2] = xyzn[0];
+    abcd[1] = xyzn[1];
+    uint t = xyzn[2] * 4 + xyzn[3];
+    abcd[0] = t / C;
+    abcd[3] = t % C;
 }
 
 inline void abcd2xyzn(int C, int abcd[4], int xyzn[4]) {
-  xyzn[0] = abcd[2];
-  xyzn[1] = abcd[1];
-  uint t = abcd[0] * C + abcd[3];
-  xyzn[2] = t / 4;
-  xyzn[3] = t % 4;
+    xyzn[0] = abcd[2];
+    xyzn[1] = abcd[1];
+    uint t = abcd[0] * C + abcd[3];
+    xyzn[2] = t / 4;
+    xyzn[3] = t % 4;
 }
 
 inline int32_t abcd2index(int32_t dim[4], int32_t abcd[4]) {
-  int32_t r = abcd[0];
-  r = r * dim[1] + abcd[1];
-  r = r * dim[2] + abcd[2];
-  r = r * dim[3] + abcd[3];
-  return r;
+    int32_t r = abcd[0];
+    r = r * dim[1] + abcd[1];
+    r = r * dim[2] + abcd[2];
+    r = r * dim[3] + abcd[3];
+    return r;
 }
 
 inline void index2abcd(int32_t dim[4], int32_t ind, int32_t abcd[4]) {
-  abcd[3] = ind % dim[3]; ind /= dim[3];
-  abcd[2] = ind % dim[2]; ind /= dim[2];
-  abcd[1] = ind % dim[1]; ind /= dim[1];
-  abcd[0] = ind;
+    abcd[3] = ind % dim[3]; ind /= dim[3];
+    abcd[2] = ind % dim[2]; ind /= dim[2];
+    abcd[1] = ind % dim[1]; ind /= dim[1];
+    abcd[0] = ind;
 }
 
 inline void trans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) {
-  for (int i = 0; i < 4; i++) {
-    opos[i] = ipos[trans[i]];
-  }
+    for (int i = 0; i < 4; i++) {
+        opos[i] = ipos[trans[i]];
+    }
 }
 
 inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) {
-  for (int i = 0; i < 4; i++) {
-    opos[trans[i]] = ipos[i];
-  }
+    for (int i = 0; i < 4; i++) {
+        opos[trans[i]] = ipos[i];
+    }
 }
 
 
 struct MetalConvParam {
-  short offsetX;
-  short offsetY;
-  short offsetZ;
-  ushort strideX;
-  ushort strideY;
-  ushort dilationX;
-  ushort dilationY;
+    short offsetX;
+    short offsetY;
+    short offsetZ;
+    ushort strideX;
+    ushort strideY;
+    ushort dilationX;
+    ushort dilationY;
 };
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.inc.metal
index 2b070fc48b..ff8bd3d7a3 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.inc.metal
@@ -42,73 +42,73 @@
 //                                     uint3 gid [[thread_position_in_grid]]) {
 //}
 kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
-                                          texture2d_array<P, access::read> in1 [[texture(1)]],
+                                      texture2d_array<P, access::read> in1 [[texture(1)]],
 #if N >= 3
-                                          texture2d_array<P, access::read> in2 [[texture(2)]],
+                                      texture2d_array<P, access::read> in2 [[texture(2)]],
 #endif
 #if N >= 4
-                                          texture2d_array<P, access::read> in3 [[texture(3)]],
+                                      texture2d_array<P, access::read> in3 [[texture(3)]],
 #endif
 #if N >= 5
-                                          texture2d_array<P, access::read> in4 [[texture(4)]],
+                                      texture2d_array<P, access::read> in4 [[texture(4)]],
 #endif
 #if N >= 6
-                                          texture2d_array<P, access::read> in5 [[texture(5)]],
+                                      texture2d_array<P, access::read> in5 [[texture(5)]],
 #endif
-                                          texture2d_array<P, access::read> inx [[texture(N)]],
-                                          texture2d_array<P, access::write> out [[texture(N+1)]],
-                                          constant ConcatParam & pm [[buffer(0)]],
-                                          uint3 gid [[thread_position_in_grid]]) {
-
-   ConcatParam cp = pm;
-   int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4];
-   VECTOR(P, 4) r = inx.read(gid.xy, gid.z);
-   for (int i = 0; i < 4; i++) {
-     xyzn[3] = i;
+                                      texture2d_array<P, access::read> inx [[texture(N)]],
+                                      texture2d_array<P, access::write> out [[texture(N+1)]],
+                                      constant ConcatParam & pm [[buffer(0)]],
+                                      uint3 gid [[thread_position_in_grid]]) {
+    
+    ConcatParam cp = pm;
+    int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4];
+    VECTOR(P, 4) r = inx.read(gid.xy, gid.z);
+    for (int i = 0; i < 4; i++) {
+        xyzn[3] = i;
 #if R == 4
-     xyzn2abcd_4(cp.odim[3], xyzn, abcd);
+        xyzn2abcd_4(cp.odim[3], xyzn, abcd);
 #else
-     FUNC_R(xyzn2abcd, R)(xyzn, abcd);
+        FUNC_R(xyzn2abcd, R)(xyzn, abcd);
 #endif
-     int k = abcd[cp.axis] - cp.offset;
-     if (k < 0) continue;
-     int j = 0;
-     for (; j < N; j++) {
-       if (k < cp.vdim[j]) {
-         break;
-       }
-       k -= cp.vdim[j];
-     }
-     if (j == N) {
-       continue;
-     }
-     int ta = cp.odim[cp.axis];
-     abcd[cp.axis] = k;
-     cp.odim[cp.axis] = cp.vdim[j];
+        int k = abcd[cp.axis] - cp.offset;
+        if (k < 0) continue;
+        int j = 0;
+        for (; j < N; j++) {
+            if (k < cp.vdim[j]) {
+                break;
+            }
+            k -= cp.vdim[j];
+        }
+        if (j == N) {
+            continue;
+        }
+        int ta = cp.odim[cp.axis];
+        abcd[cp.axis] = k;
+        cp.odim[cp.axis] = cp.vdim[j];
 #if R == 4
-     abcd2xyzn_4(cp.odim[3], abcd, oxyzn);
+        abcd2xyzn_4(cp.odim[3], abcd, oxyzn);
 #else
-     FUNC_R(abcd2xyzn, R)(abcd, oxyzn);
+        FUNC_R(abcd2xyzn, R)(abcd, oxyzn);
 #endif
-     cp.odim[cp.axis] = ta;
-     switch (j) {
-       case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
-       case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+        cp.odim[cp.axis] = ta;
+        switch (j) {
+            case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+            case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
 #if N >= 3
-       case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+            case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
 #endif
 #if N >= 4
-       case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+            case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
 #endif
 #if N >= 5
-       case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+            case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
 #endif
 #if N >= 6
-       case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+            case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
 #endif
-     }
-   }
-   out.write(r, gid.xy, gid.z);
+        }
+    }
+    out.write(r, gid.xy, gid.z);
 }
 
 #endif // V == NORMAL
@@ -117,66 +117,66 @@ kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[tex
 
 #if V == VX
 kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
-                                          texture2d_array<P, access::read> in1 [[texture(1)]],
+                                      texture2d_array<P, access::read> in1 [[texture(1)]],
 #if N >= 3
-                                          texture2d_array<P, access::read> in2 [[texture(2)]],
+                                      texture2d_array<P, access::read> in2 [[texture(2)]],
 #endif // N >= 3
 #if N >= 4
-                                          texture2d_array<P, access::read> in3 [[texture(3)]],
+                                      texture2d_array<P, access::read> in3 [[texture(3)]],
 #endif // N >= 4
 #if N >= 5
-                                          texture2d_array<P, access::read> in4 [[texture(4)]],
+                                      texture2d_array<P, access::read> in4 [[texture(4)]],
 #endif // N >= 5
 #if N >= 6
-                                          texture2d_array<P, access::read> in5 [[texture(5)]],
+                                      texture2d_array<P, access::read> in5 [[texture(5)]],
 #endif // N >= 6
-                                          texture2d_array<P, access::write> out [[texture(N)]],
-                                          constant ConcatParam & pm [[buffer(0)]],
-                                          uint3 gid [[thread_position_in_grid]]) {
-  int x = gid.x - pm.offset;
-  if (x < 0) return;
-  if (x < pm.vdim[0]) {
-    VECTOR(P, 4) r = in0.read(gid.xy, gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
-  x -= pm.vdim[0];
-  if (x < pm.vdim[1]) {
-    VECTOR(P, 4) r = in1.read(uint2(x, gid.y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+                                      texture2d_array<P, access::write> out [[texture(N)]],
+                                      constant ConcatParam & pm [[buffer(0)]],
+                                      uint3 gid [[thread_position_in_grid]]) {
+    int x = gid.x - pm.offset;
+    if (x < 0) return;
+    if (x < pm.vdim[0]) {
+        VECTOR(P, 4) r = in0.read(gid.xy, gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
+    x -= pm.vdim[0];
+    if (x < pm.vdim[1]) {
+        VECTOR(P, 4) r = in1.read(uint2(x, gid.y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #if N >= 3
-  x -= pm.vdim[1];
-  if (x < pm.vdim[2]) {
-    VECTOR(P, 4) r = in2.read(uint2(x, gid.y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    x -= pm.vdim[1];
+    if (x < pm.vdim[2]) {
+        VECTOR(P, 4) r = in2.read(uint2(x, gid.y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 3
 #if N >= 4
-  x -= pm.vdim[2];
-  if (x < pm.vdim[3]) {
-    VECTOR(P, 4) r = in3.read(uint2(x, gid.y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    x -= pm.vdim[2];
+    if (x < pm.vdim[3]) {
+        VECTOR(P, 4) r = in3.read(uint2(x, gid.y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 4
 #if N >= 5
-  x -= pm.vdim[3];
-  if (x < pm.vdim[4]) {
-    VECTOR(P, 4) r = in4.read(uint2(x, gid.y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    x -= pm.vdim[3];
+    if (x < pm.vdim[4]) {
+        VECTOR(P, 4) r = in4.read(uint2(x, gid.y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 5
 #if N >= 6
-  x -= pm.vdim[4];
-  if (x < pm.vdim[5]) {
-    VECTOR(P, 4) r = in5.read(uint2(x, gid.y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    x -= pm.vdim[4];
+    if (x < pm.vdim[5]) {
+        VECTOR(P, 4) r = in5.read(uint2(x, gid.y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 6
 }
 #endif // V == VX
@@ -199,50 +199,50 @@ kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[tex
                                       texture2d_array<P, access::write> out [[texture(N)]],
                                       constant ConcatParam & pm [[buffer(0)]],
                                       uint3 gid [[thread_position_in_grid]]) {
-  int y = gid.y - pm.offset;
-  if (y < 0) return;
-  if (y < pm.vdim[0]) {
-    VECTOR(P, 4)  r = in0.read(gid.xy, gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
-  y -= pm.vdim[0];
-  if (y < pm.vdim[1]) {
-    VECTOR(P, 4)  r = in1.read(uint2(gid.x, y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    int y = gid.y - pm.offset;
+    if (y < 0) return;
+    if (y < pm.vdim[0]) {
+        VECTOR(P, 4)  r = in0.read(gid.xy, gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
+    y -= pm.vdim[0];
+    if (y < pm.vdim[1]) {
+        VECTOR(P, 4)  r = in1.read(uint2(gid.x, y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #if N >= 3
-  y -= pm.vdim[1];
-  if (y < pm.vdim[2]) {
-    VECTOR(P, 4)  r = in2.read(uint2(gid.x, y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    y -= pm.vdim[1];
+    if (y < pm.vdim[2]) {
+        VECTOR(P, 4)  r = in2.read(uint2(gid.x, y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 3
 #if N >= 4
-  y -= pm.vdim[2];
-  if (y < pm.vdim[3]) {
-    VECTOR(P, 4)  r = in3.read(uint2(gid.x, y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    y -= pm.vdim[2];
+    if (y < pm.vdim[3]) {
+        VECTOR(P, 4)  r = in3.read(uint2(gid.x, y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 4
 #if N >= 5
-  y -= pm.vdim[3];
-  if (y < pm.vdim[4]) {
-    VECTOR(P, 4)  r = in4.read(uint2(gid.x, y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    y -= pm.vdim[3];
+    if (y < pm.vdim[4]) {
+        VECTOR(P, 4)  r = in4.read(uint2(gid.x, y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 5
 #if N >= 6
-  y -= pm.vdim[4];
-  if (y < pm.vdim[5]) {
-    VECTOR(P, 4)  r = in5.read(uint2(gid.x, y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    y -= pm.vdim[4];
+    if (y < pm.vdim[5]) {
+        VECTOR(P, 4)  r = in5.read(uint2(gid.x, y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 6
 }
 #endif // V == VY
@@ -265,50 +265,50 @@ kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[tex
                                       texture2d_array<P, access::write> out [[texture(N)]],
                                       constant ConcatParam & pm [[buffer(0)]],
                                       uint3 gid [[thread_position_in_grid]]) {
-  int z = gid.z - pm.offset;
-  if (z < 0) return;
-  if (z < pm.vdim[0]) {
-    VECTOR(P, 4) r = in0.read(gid.xy, gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
-  z -= pm.vdim[0];
-  if (z < pm.vdim[1]) {
-    VECTOR(P, 4)  r = in1.read(gid.xy, z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    int z = gid.z - pm.offset;
+    if (z < 0) return;
+    if (z < pm.vdim[0]) {
+        VECTOR(P, 4) r = in0.read(gid.xy, gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
+    z -= pm.vdim[0];
+    if (z < pm.vdim[1]) {
+        VECTOR(P, 4)  r = in1.read(gid.xy, z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #if N >= 3
-  z -= pm.vdim[1];
-  if (z < pm.vdim[2]) {
-    VECTOR(P, 4)  r = in2.read(gid.xy, z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    z -= pm.vdim[1];
+    if (z < pm.vdim[2]) {
+        VECTOR(P, 4)  r = in2.read(gid.xy, z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 3
 #if N >= 4
-  z -= pm.vdim[2];
-  if (z < pm.vdim[3]) {
-    VECTOR(P, 4)  r = in3.read(gid.xy, z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    z -= pm.vdim[2];
+    if (z < pm.vdim[3]) {
+        VECTOR(P, 4)  r = in3.read(gid.xy, z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 4
 #if N >= 5
-  z -= pm.vdim[3];
-  if (z < pm.vdim[4]) {
-    VECTOR(P, 4)  r = in4.read(gid.xy, z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    z -= pm.vdim[3];
+    if (z < pm.vdim[4]) {
+        VECTOR(P, 4)  r = in4.read(gid.xy, z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 5
 #if N >= 6
-  z -= pm.vdim[4];
-  if (z < pm.vdim[5]) {
-    VECTOR(P, 4)  r = in5.read(gid.xy, z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    z -= pm.vdim[4];
+    if (z < pm.vdim[5]) {
+        VECTOR(P, 4)  r = in5.read(gid.xy, z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 6
 }
 #endif // V == VZ
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal
index b7d17f2d25..8a0390e624 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal
@@ -18,11 +18,11 @@
 using namespace metal;
 
 struct ConcatParam {
-  int32_t odim[4];
-  int32_t axis;
-  int32_t offset;
-  int32_t trans[4];
-  int32_t vdim[6];
+    int32_t odim[4];
+    int32_t axis;
+    int32_t offset;
+    int32_t trans[4];
+    int32_t vdim[6];
 };
 
 #define VNORMAL 1
@@ -41,129 +41,129 @@ struct ConcatParam {
 
 // ssd-ar: (R=3, N=5, V=x)
 #define V VX
-  #define R 3
-    #define N 5
-      #define P float
-        #include "ConcatKernel.inc.metal"
-      #undef P
-      #define P half
-        #include "ConcatKernel.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 3
+#define N 5
+#define P float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 // ssd-ar: (R=2, N=5, V=x)
 #define V VX
-  #define R 2
-    #define N 5
-      #define P float
-        #include "ConcatKernel.inc.metal"
-      #undef P
-      #define P half
-        #include "ConcatKernel.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 2
+#define N 5
+#define P float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 
 // ssd-ar: (R=3, N=2, V=y)
 #define V VY
-  #define R 3
-    #define N 2
-      #define P float
-        #include "ConcatKernel.inc.metal"
-      #undef P
-      #define P half
-        #include "ConcatKernel.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 3
+#define N 2
+#define P float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 // ssd-ar: (R=4, N=3, V=z)
 #define V VZ
-  #define R 4
-    #define N 3
-      #define P float
-        #include "ConcatKernel.inc.metal"
-      #undef P
-      #define P half
-        #include "ConcatKernel.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 4
+#define N 3
+#define P float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 
 // ssd: (R=2, N=6, V=y)
 #define V VY
-  #define R 2
-    #define N 6
-      #define P float
-        #include "ConcatKernel.inc.metal"
-      #undef P
-      #define P half
-        #include "ConcatKernel.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 2
+#define N 6
+#define P float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 // ssd: (R=3, N=6, V=y)
 #define V VY
-  #define R 3
-    #define N 6
-      #define P float
-        #include "ConcatKernel.inc.metal"
-      #undef P
-      #define P half
-        #include "ConcatKernel.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 3
+#define N 6
+#define P float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 #define V VNORMAL
-  #define R 4
-    #define N 2
-      #define P float
-        #include "ConcatKernel.inc.metal"
-      #undef P
-      #define P half
-        #include "ConcatKernel.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 4
+#define N 2
+#define P float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 
 #define V VY
-  #define R 2
-    #define N 2
-      #define P float
-        #include "ConcatKernel.inc.metal"
-      #undef P
-      #define P half
-        #include "ConcatKernel.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 2
+#define N 2
+#define P float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 
 #define V VY
-  #define R 2
-    #define N 5
-      #define P float
-        #include "ConcatKernel.inc.metal"
-      #undef P
-      #define P half
-        #include "ConcatKernel.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 2
+#define N 5
+#define P float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddBNReluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddBNReluKernel.metal
index 87b60a64fc..f55386096f 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddBNReluKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddBNReluKernel.metal
@@ -18,147 +18,147 @@ using namespace metal;
 
 
 kernel void conv_add_batch_norm_relu_1x1_half(
-            texture2d_array<half, access::sample> inTexture [[texture(0)]],
-            texture2d_array<half, access::write> outTexture [[texture(1)]],
-            constant MetalConvParam &param [[buffer(0)]],
-            const device half4 *weights [[buffer(1)]],
-            const device half4 *biase [[buffer(2)]],
-            const device half4 *new_scale [[buffer(3)]],
-            const device half4 *new_biase [[buffer(4)]],
-            uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  half4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
+                                              texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                              constant MetalConvParam &param [[buffer(0)]],
+                                              const device half4 *weights [[buffer(1)]],
+                                              const device half4 *biase [[buffer(2)]],
+                                              const device half4 *new_scale [[buffer(3)]],
+                                              const device half4 *new_biase [[buffer(4)]],
+                                              uint3 gid [[thread_position_in_grid]]) {
     
-    half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
     
-    half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
+    float4 output = float4(0.0);
     
-    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-  output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
-  outTexture.write(half4(output), gid.xy, gid.z);
+    half4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+        output.x += dot(input, weight_x);
+        
+        half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+        output.y += dot(input, weight_y);
+        
+        half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+        output.z += dot(input, weight_z);
+        
+        half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(input, weight_w);
+    }
+    output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
 
 kernel void conv_add_batch_norm_relu_3x3_half(
-            texture2d_array<half, access::sample> inTexture [[texture(0)]],
-            texture2d_array<half, access::write> outTexture [[texture(1)]],
-            constant MetalConvParam &param [[buffer(0)]],
-            const device half4 *weights [[buffer(1)]],
-            const device half4 *biase [[buffer(2)]],
-            const device half4 *new_scale [[buffer(3)]],
-            const device half4 *new_biase [[buffer(4)]],
-            uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  half4 input[9];
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
-    for (int j = 0; j < 9; ++j) {
-      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+                                              texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                              constant MetalConvParam &param [[buffer(0)]],
+                                              const device half4 *weights [[buffer(1)]],
+                                              const device half4 *biase [[buffer(2)]],
+                                              const device half4 *new_scale [[buffer(3)]],
+                                              const device half4 *new_biase [[buffer(4)]],
+                                              uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = float4(0.0);
+    
+    half4 input[9];
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+        for (int j = 0; j < 9; ++j) {
+            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-  output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
-  outTexture.write(half4(output), gid.xy, gid.z);
+    output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
 
 kernel void depthwise_conv_add_batch_norm_relu_3x3_half(
-            texture2d_array<half, access::sample> inTexture [[texture(0)]],
-            texture2d_array<half, access::write> outTexture [[texture(1)]],
-            constant MetalConvParam &param [[buffer(0)]],
-            const device half *weights [[buffer(1)]],
-            const device half4 *biase [[buffer(2)]],
-            const device half4 *new_scale [[buffer(3)]],
-            const device half4 *new_biase [[buffer(4)]],
-            uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = float4(0.0);
-  half4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    half4 input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-  output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
-  outTexture.write(half4(output), gid.xy, gid.z);
+                                                        texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                                        texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                                        constant MetalConvParam &param [[buffer(0)]],
+                                                        const device half *weights [[buffer(1)]],
+                                                        const device half4 *biase [[buffer(2)]],
+                                                        const device half4 *new_scale [[buffer(3)]],
+                                                        const device half4 *new_biase [[buffer(4)]],
+                                                        uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z;
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint weithTo = gid.z * kernelHXW * 4;
+    float4 output = float4(0.0);
+    half4 inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {
+        half4 input = inputs[j];
+        output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+    }
+    output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
 
 
@@ -175,41 +175,41 @@ kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample>
                                          const device float4 *new_scale [[buffer(3)]],
                                          const device float4 *new_biase [[buffer(4)]],
                                          uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  float4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
     
-    float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
     
-    float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
+    float4 output = float4(0.0);
     
-    float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
-  outTexture.write(output, gid.xy, gid.z);
+    float4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+        output.x += dot(input, weight_x);
+        
+        float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+        output.y += dot(input, weight_y);
+        
+        float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+        output.z += dot(input, weight_z);
+        
+        float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(input, weight_w);
+    }
+    output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -220,50 +220,50 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample>
                                          const device float4 *new_scale [[buffer(3)]],
                                          const device float4 *new_biase [[buffer(4)]],
                                          uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  float4 input[9];
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
-    for (int j = 0; j < 9; ++j) {
-      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = float4(0.0);
+    
+    float4 input[9];
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+        for (int j = 0; j < 9; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
-  outTexture.write(output, gid.xy, gid.z);
+    output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -274,37 +274,37 @@ kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access
                                                    const device float4 *new_scale [[buffer(3)]],
                                                    const device float4 *new_biase [[buffer(4)]],
                                                    uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = float4(0.0);
-  float4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    float4 input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
-  outTexture.write(output, gid.xy, gid.z);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z;
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint weithTo = gid.z * kernelHXW * 4;
+    float4 output = float4(0.0);
+    float4 inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {
+        float4 input = inputs[j];
+        output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+    }
+    output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal
index 274e416576..e2513e1b1e 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal
@@ -24,41 +24,41 @@ kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[text
                          const device float4 *weights [[buffer(1)]],
                          const device float4 *biase [[buffer(2)]],
                          uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = biase[gid.z];
-  
-  float4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
-    
-    float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
-    
-    float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
-    
-    float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-//  output = output + biase[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = biase[gid.z];
+    
+    float4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+        output.x += dot(input, weight_x);
+        
+        float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+        output.y += dot(input, weight_y);
+        
+        float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+        output.z += dot(input, weight_z);
+        
+        float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(input, weight_w);
+    }
+    // No separate "output + biase[gid.z]" step: output was initialised to biase[gid.z] above, so the bias is already included.
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -67,66 +67,66 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text
                          const device float4 *weights [[buffer(1)]],
                          const device float4 *biase [[buffer(2)]],
                          uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  
-  const uint kernelHXW = 9;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = biase[gid.z];
-  
-  ushort dilation_x = param.dilationX;
-  ushort dilation_y = param.dilationY;
-  
-  float4 input[9];
-  
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
-    
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
-    
-    input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
-    
-    input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
-    
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    
-    input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
-    
-    input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
-    
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
-    
-    input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
     
-    for (int j = 0; j < 9; ++j) {
-      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    
+    const uint kernelHXW = 9;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = biase[gid.z];
+    
+    ushort dilation_x = param.dilationX;
+    ushort dilation_y = param.dilationY;
+    
+    float4 input[9];
+    
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
+        
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
+        
+        input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
+        
+        input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
+        
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        
+        input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
+        
+        input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
+        
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
+        
+        input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
+        
+        for (int j = 0; j < 9; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-//  output = output + biase[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
+    // No separate "output + biase[gid.z]" step: output was initialised to biase[gid.z] above, so the bias is already included.
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void conv_add_5x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -135,56 +135,56 @@ kernel void conv_add_5x1(texture2d_array<float, access::sample> inTexture [[text
                          const device float4 *weights [[buffer(1)]],
                          const device float4 *biase [[buffer(2)]],
                          uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  
-  const uint kernelHXW = 5;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = biase[gid.z];
-  
-  ushort dilation_y = param.dilationY;
-  float4 input[5];
-  
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
-    
-    input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
-    
-    input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    
-    input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
-    
-    input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
-    
-    for (int j = 0; j < 5; ++j) {
-      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    
+    const uint kernelHXW = 5;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = biase[gid.z];
+    
+    ushort dilation_y = param.dilationY;
+    float4 input[5];
+    
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
+        
+        input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+        
+        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        
+        input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+        
+        input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
+        
+        for (int j = 0; j < 5; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-//  output = output + biase[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
+    // No separate "output + biase[gid.z]" step: output was initialised to biase[gid.z] above, so the bias is already included.
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 
@@ -194,56 +194,56 @@ kernel void conv_add_1x5(texture2d_array<float, access::sample> inTexture [[text
                          const device float4 *weights [[buffer(1)]],
                          const device float4 *biase [[buffer(2)]],
                          uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  
-  const uint kernelHXW = 5;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = biase[gid.z];
-  
-  ushort dilation_x = param.dilationX;
-  float4 input[5];
-  
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
-    
-    input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
-    
-    input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    
-    input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
-    
-    input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
-    
-    for (int j = 0; j < 5; ++j) {
-      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
     }
-  }
-//  output = output + biase[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    
+    const uint kernelHXW = 5;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = biase[gid.z];
+    
+    ushort dilation_x = param.dilationX;
+    float4 input[5];
+    
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
+        
+        input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+        
+        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        
+        input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+        
+        input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
+        
+        for (int j = 0; j < 5; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
+    }
+    // No separate "output + biase[gid.z]" step: output was initialised to biase[gid.z] above, so the bias is already included.
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 
@@ -253,297 +253,297 @@ kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inText
                                    const device float *weights [[buffer(1)]],
                                    const device float4 *biase [[buffer(2)]],
                                    uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = biase[gid.z];
-  float4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    float4 input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-//  output = output + biase[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z;
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint weithTo = gid.z * kernelHXW * 4;
+    float4 output = biase[gid.z];
+    float4 inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {
+        float4 input = inputs[j];
+        output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+    }
+    //  output = output + biase[gid.z];
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 
 #pragma mark - half
 
 kernel void conv_add_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                         texture2d_array<half, access::write> outTexture [[texture(1)]],
-                         constant MetalConvParam &param [[buffer(0)]],
-                         const device half4 *weights [[buffer(1)]],
-                         const device half4 *biase [[buffer(2)]],
-                         uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  half4 output = biase[gid.z];
-  
-  half4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
-    
-    half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
-    
-    half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
-    
-    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-//  output = output + float4(biase[gid.z]);
-  outTexture.write(output, gid.xy, gid.z);
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    // Source texel for this output position: stride scaling plus the configured offset.
+    const ushort2 srcPos = ushort2(gid.xy) * ushort2(param.strideX, param.strideY) + ushort2(param.offsetX, param.offsetY);
+    const float2 srcCoord = float2(srcPos.x, srcPos.y);
+    
+    // Unnormalised pixel coordinates, nearest-texel filtering, clamp_to_zero addressing.
+    constexpr sampler texelSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+    
+    // A 1x1 filter has a single tap: each output channel reads one half4 weight per input slice,
+    // and the four channels of output slice gid.z start at gid.z * sliceCount * 4.
+    const uint sliceCount = inTexture.get_array_size();
+    const uint weightBase = gid.z * sliceCount * 4;
+    
+    // Accumulate in float, seeded with the bias; narrow to half only on the final write.
+    float4 acc = float4(biase[gid.z]);
+    
+    for (uint slice = 0; slice < sliceCount; ++slice) {
+        const float4 texel = float4(inTexture.sample(texelSampler, srcCoord, slice));
+        
+        // Channel c uses the weight row that starts c * sliceCount past weightBase.
+        for (uint c = 0; c < 4; ++c) {
+            const float4 w = float4(weights[weightBase + c * sliceCount + slice]);
+            acc[c] += dot(texel, w);
+        }
+    }
+    
+    // No separate bias add is needed here: the accumulator was initialised
+    // from biase[gid.z] above.
+    outTexture.write(half4(acc), gid.xy, gid.z);
 }
 
 kernel void conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                         texture2d_array<half, access::write> outTexture [[texture(1)]],
-                         constant MetalConvParam &param [[buffer(0)]],
-                         const device half4 *weights [[buffer(1)]],
-                         const device half4 *biase [[buffer(2)]],
-                         uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  half4 output = biase[gid.z];
-  
-  ushort dilation_x = param.dilationX;
-  ushort dilation_y = param.dilationY;
-  
-  half4 input[9];
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y - dilation_y), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y - dilation_y), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
-    for (int j = 0; j < 9; ++j) {
-      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(float4(input[j]), float4(weight_x));
-      
-      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(float4(input[j]), float4(weight_y));
-      
-      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(float4(input[j]), float4(weight_z));
-      
-      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(float4(input[j]), float4(weight_w));
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
     }
-  }
-//  output = output + float4(biase[gid.z]);
-  outTexture.write(output, gid.xy, gid.z);
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    // posInInput is the centre tap (input[4]) of the 3x3 window, in input pixel coordinates.
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    // Accumulate in float (as conv_add_1x1_half does): a half accumulator rounds after every tap.
+    float4 output = float4(biase[gid.z]);
+    
+    ushort dilation_x = param.dilationX;
+    ushort dilation_y = param.dilationY;
+    
+    half4 input[9];
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y - dilation_y), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y - dilation_y), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
+        for (int j = 0; j < 9; ++j) {
+            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(float4(input[j]), float4(weight_x));
+            
+            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(float4(input[j]), float4(weight_y));
+            
+            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(float4(input[j]), float4(weight_z));
+            
+            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(float4(input[j]), float4(weight_w));
+        }
+    }
+    // Bias is already in the accumulator's initial value; narrow to half only for the store.
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
 
 kernel void depthwise_conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                   texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                   constant MetalConvParam &param [[buffer(0)]],
-                                   const device half *weights [[buffer(1)]],
-                                   const device half4 *biase [[buffer(2)]],
-                                   uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  half4 output = biase[gid.z];
-  half4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    half4 input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-//  output = output + float4(biase[gid.z]);
-  outTexture.write(output, gid.xy, gid.z);
+                                        texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                        constant MetalConvParam &param [[buffer(0)]],
+                                        const device half *weights [[buffer(1)]],
+                                        const device half4 *biase [[buffer(2)]],
+                                        uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z;  // depthwise: the input slice sampled is the output slice index
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint weithTo = gid.z * kernelHXW * 4;  // 9 scalar weights for each of the slice's 4 channels
+    float4 output = float4(biase[gid.z]);  // float accumulator seeded with the bias
+    half4 inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {
+        float4 input = float4(inputs[j]);  // widen so the multiply-add below runs in float
+        output.x += input.x * float(weights[weithTo + 0 * kernelHXW + j]);
+        output.y += input.y * float(weights[weithTo + 1 * kernelHXW + j]);
+        output.z += input.z * float(weights[weithTo + 2 * kernelHXW + j]);
+        output.w += input.w * float(weights[weithTo + 3 * kernelHXW + j]);
+    }
+    // Bias is already in the accumulator's initial value; narrow to half only for the store.
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
 
 
 kernel void conv_add_5x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                         texture2d_array<half, access::write> outTexture [[texture(1)]],
-                         constant MetalConvParam &param [[buffer(0)]],
-                         const device half4 *weights [[buffer(1)]],
-                         const device half4 *biase [[buffer(2)]],
-                         uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  
-  const uint kernelHXW = 5;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  half4 output = biase[gid.z];
-  
-  ushort dilation_y = param.dilationY;
-  half4 input[5];
-  
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
-    
-    input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
-    
-    input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    
-    input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
-    
-    input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
-    
-    for (int j = 0; j < 5; ++j) {
-      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    // posInInput is the centre tap (input[2]) of the window, in input pixel coordinates.
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    
+    const uint kernelHXW = 5;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    // Weights are indexed [output channel][tap j][input slice i]; weithTo is this slice's first entry.
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    // Accumulate in float (as conv_add_1x1_half does) so the long reduction is not rounded to half.
+    float4 output = float4(biase[gid.z]);
+    
+    ushort dilation_y = param.dilationY;
+    half4 input[5];
+    
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
+        
+        input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+        
+        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        
+        input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+        
+        input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
+        // The five taps above step along y only, spaced by dilation_y.
+        for (int j = 0; j < 5; ++j) {
+            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(float4(input[j]), float4(weight_x));
+            
+            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(float4(input[j]), float4(weight_y));
+            
+            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(float4(input[j]), float4(weight_z));
+            
+            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(float4(input[j]), float4(weight_w));
+        }
     }
-  }
-//  output = output + float4(biase[gid.z]);
-  outTexture.write(output, gid.xy, gid.z);
+    // Bias is already in the accumulator's initial value; narrow to half only for the store.
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
 
 
 kernel void conv_add_1x5_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                         texture2d_array<half, access::write> outTexture [[texture(1)]],
-                         constant MetalConvParam &param [[buffer(0)]],
-                         const device half4 *weights [[buffer(1)]],
-                         const device half4 *biase [[buffer(2)]],
-                         uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  
-  const uint kernelHXW = 5;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  half4 output = biase[gid.z];
-  
-  ushort dilation_x = param.dilationX;
-  half4 input[5];
-  
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
-    
-    input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
-    
-    input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    
-    input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
-    
-    input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
-    
-    for (int j = 0; j < 5; ++j) {
-      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    // posInInput is the centre tap (input[2]) of the window, in input pixel coordinates.
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    
+    const uint kernelHXW = 5;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    // Weights are indexed [output channel][tap j][input slice i]; weithTo is this slice's first entry.
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    // Accumulate in float (as conv_add_1x1_half does) so the long reduction is not rounded to half.
+    float4 output = float4(biase[gid.z]);
+    
+    ushort dilation_x = param.dilationX;
+    half4 input[5];
+    
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
+        
+        input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+        
+        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        
+        input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+        
+        input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
+        // The five taps above step along x only, spaced by dilation_x.
+        for (int j = 0; j < 5; ++j) {
+            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(float4(input[j]), float4(weight_x));
+            
+            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(float4(input[j]), float4(weight_y));
+            
+            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(float4(input[j]), float4(weight_z));
+            
+            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(float4(input[j]), float4(weight_w));
+        }
     }
-  }
-//  output = output + float4(biase[gid.z]);
-  outTexture.write(output, gid.xy, gid.z);
+    // Bias is already in the accumulator's initial value; narrow to half only for the store.
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
 
 
@@ -553,69 +553,69 @@ kernel void test_conv_add_3x3(texture2d_array<float, access::sample> inTexture [
                               const device float4 *weights [[buffer(1)]],
                               const device float4 *biase [[buffer(2)]],
                               uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  if (gid.x > 0 || gid.y > 0 || gid.z > 0) { return; }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  
-  const uint kernelHXW = 9;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  ushort dilation_x = param.dilationX;
-  ushort dilation_y = param.dilationY;
-  
-  float4 input[9];
-  
-  for (uint i = 0; i < input_arr_size; ++i) {
-    
-    input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
-    
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
-    
-    input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
-    
-    input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
-    
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    
-    input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
-    
-    input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
-    
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
-    
-    input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
     
-    for (int j = 0; j < 9; ++j) {
-      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    if (gid.x > 0 || gid.y > 0 || gid.z > 0) { return; }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    
+    const uint kernelHXW = 9;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = float4(0.0);
+    
+    ushort dilation_x = param.dilationX;
+    ushort dilation_y = param.dilationY;
+    
+    float4 input[9];
+    
+    for (uint i = 0; i < input_arr_size; ++i) {
+        
+        input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
+        
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
+        
+        input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
+        
+        input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
+        
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        
+        input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
+        
+        input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
+        
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
+        
+        input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
+        
+        for (int j = 0; j < 9; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-  //  output = output + biase[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
+    //  output = output + biase[gid.z];
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPrelu.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPrelu.inc.metal
index 069daa20e8..e2b8834cc5 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPrelu.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPrelu.inc.metal
@@ -19,428 +19,428 @@
 
 #pragma mark - convAdd
 kernel void FUNC3_(conv_add_1x1, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
-                         texture2d_array<P, access::write> outTexture [[texture(1)]],
-                         constant MetalConvParam &param [[buffer(0)]],
-                         const device VECTOR(P, 4) *weights [[buffer(1)]],
-                         const device VECTOR(P, 4) *biase [[buffer(2)]],
+                                                texture2d_array<P, access::write> outTexture [[texture(1)]],
+                                                constant MetalConvParam &param [[buffer(0)]],
+                                                const device VECTOR(P, 4) *weights [[buffer(1)]],
+                                                const device VECTOR(P, 4) *biase [[buffer(2)]],
 #ifdef PRELU_CHANNEL
-                         const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_ELEMENT
-                         const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_OTHER
-                         const device P *alpha [[buffer(3)]],
+                                                const device P *alpha [[buffer(3)]],
 #endif
-                         uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  VECTOR(P, 4) output = biase[gid.z];
-  
-  VECTOR(P, 4) input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample,float2(posInInput.x, posInInput.y), i);
-    VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
-    
-    VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
-    
-    VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
-    
-    VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-  
-//  output = output + float4(biase[gid.z]);
-  
+                                                uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    VECTOR(P, 4) output = biase[gid.z];
+    
+    VECTOR(P, 4) input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = inTexture.sample(sample,float2(posInInput.x, posInInput.y), i);
+        VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+        output.x += dot(input, weight_x);
+        
+        VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+        output.y += dot(input, weight_y);
+        
+        VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+        output.z += dot(input, weight_z);
+        
+        VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(input, weight_w);
+    }
+    
+    //  output = output + float4(biase[gid.z]);
+    
 #ifdef PRELU_CHANNEL
-  VECTOR(P, 4) alpha_value = alpha[gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    VECTOR(P, 4) alpha_value = alpha[gid.z];
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_ELEMENT
-  int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
-  VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
+    VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_OTHER
-  P alpha_value = alpha[0];
-  output.x = output.x > 0 ? output.x : (alpha_value * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value * output.w);
+    P alpha_value = alpha[0];
+    output.x = output.x > 0 ? output.x : (alpha_value * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value * output.w);
 #endif
-  outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
+    outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
 }
 
 kernel void FUNC3_(conv_add_3x3, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
-    texture2d_array<P, access::write> outTexture [[texture(1)]],
-    constant MetalConvParam &param [[buffer(0)]],
-    const device VECTOR(P, 4) *weights [[buffer(1)]],
-    const device VECTOR(P, 4) *biase [[buffer(2)]],
+                                                texture2d_array<P, access::write> outTexture [[texture(1)]],
+                                                constant MetalConvParam &param [[buffer(0)]],
+                                                const device VECTOR(P, 4) *weights [[buffer(1)]],
+                                                const device VECTOR(P, 4) *biase [[buffer(2)]],
 #ifdef PRELU_CHANNEL
-     const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_ELEMENT
-     const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_OTHER
-     const device P *alpha [[buffer(3)]],
+                                                const device P *alpha [[buffer(3)]],
 #endif
-     uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-
-  const uint kernelHXW = 9;
-
-  uint input_arr_size = inTexture.get_array_size();
-
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-
-  VECTOR(P, 4) output = biase[gid.z];
-
-  ushort dilation_x = param.dilationX;
-  ushort dilation_y = param.dilationY;
-
-  VECTOR(P, 4) input[9];
-
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
-
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
-
-    input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
-
-    input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
-
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-
-    input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
-
-    input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
-
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
-
-    input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
-
-    for (int j = 0; j < 9; ++j) {
-      VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-
-      VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-
-      VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-
-      VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+                                                uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    
+    const uint kernelHXW = 9;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    VECTOR(P, 4) output = biase[gid.z];
+    
+    ushort dilation_x = param.dilationX;
+    ushort dilation_y = param.dilationY;
+    
+    VECTOR(P, 4) input[9];
+    
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
+        
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
+        
+        input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
+        
+        input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
+        
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        
+        input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
+        
+        input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
+        
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
+        
+        input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
+        
+        for (int j = 0; j < 9; ++j) {
+            VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-//  output = output + float4(biase[gid.z]);
-  
+    //  output = output + float4(biase[gid.z]);
+    
 #ifdef PRELU_CHANNEL
-  VECTOR(P, 4) alpha_value = alpha[gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    VECTOR(P, 4) alpha_value = alpha[gid.z];
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_ELEMENT
-  int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
-  VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
+    VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_OTHER
-  P alpha_value = alpha[0];
-  output.x = output.x > 0 ? output.x : (alpha_value * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value * output.w);
+    P alpha_value = alpha[0];
+    output.x = output.x > 0 ? output.x : (alpha_value * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value * output.w);
 #endif
-  outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
+    outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
 }
 
 kernel void FUNC3_(conv_add_5x1, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
-                         texture2d_array<P, access::write> outTexture [[texture(1)]],
-                         constant MetalConvParam &param [[buffer(0)]],
-                         const device VECTOR(P, 4) *weights [[buffer(1)]],
-                         const device VECTOR(P, 4) *biase [[buffer(2)]],
+                                                texture2d_array<P, access::write> outTexture [[texture(1)]],
+                                                constant MetalConvParam &param [[buffer(0)]],
+                                                const device VECTOR(P, 4) *weights [[buffer(1)]],
+                                                const device VECTOR(P, 4) *biase [[buffer(2)]],
 #ifdef PRELU_CHANNEL
-                        const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_ELEMENT
-                        const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_OTHER
-                        const device P *alpha [[buffer(3)]],
+                                                const device P *alpha [[buffer(3)]],
 #endif
-                         uint3 gid [[thread_position_in_grid]]) {
-
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-
-  const uint kernelHXW = 5;
-
-  uint input_arr_size = inTexture.get_array_size();
-
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-
-  VECTOR(P, 4) output = biase[gid.z];;
-
-  ushort dilation_y = param.dilationY;
-  VECTOR(P, 4) input[5];
-
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
-
-    input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
-
-    input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-
-    input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
-
-    input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
-
-    for (int j = 0; j < 5; ++j) {
-      VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-
-      VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-
-      VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-
-      VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+                                                uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    
+    const uint kernelHXW = 5;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    VECTOR(P, 4) output = biase[gid.z];;
+    
+    ushort dilation_y = param.dilationY;
+    VECTOR(P, 4) input[5];
+    
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
+        
+        input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+        
+        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        
+        input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+        
+        input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
+        
+        for (int j = 0; j < 5; ++j) {
+            VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-  
+    
 #ifdef PRELU_CHANNEL
-  VECTOR(P, 4) alpha_value = alpha[gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    VECTOR(P, 4) alpha_value = alpha[gid.z];
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_ELEMENT
-  int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
-  VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
+    VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_OTHER
-  P alpha_value = alpha[0];
-  output.x = output.x > 0 ? output.x : (alpha_value * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value * output.w);
+    P alpha_value = alpha[0];
+    output.x = output.x > 0 ? output.x : (alpha_value * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value * output.w);
 #endif
-  outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
+    outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
 }
 
 
 kernel void FUNC3_(conv_add_1x5, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
-                         texture2d_array<P, access::write> outTexture [[texture(1)]],
-                         constant MetalConvParam &param [[buffer(0)]],
-                         const device VECTOR(P, 4) *weights [[buffer(1)]],
-                         const device VECTOR(P, 4) *biase [[buffer(2)]],
+                                                texture2d_array<P, access::write> outTexture [[texture(1)]],
+                                                constant MetalConvParam &param [[buffer(0)]],
+                                                const device VECTOR(P, 4) *weights [[buffer(1)]],
+                                                const device VECTOR(P, 4) *biase [[buffer(2)]],
 #ifdef PRELU_CHANNEL
-                         const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_ELEMENT
-                         const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_OTHER
-                         const device P *alpha [[buffer(3)]],
+                                                const device P *alpha [[buffer(3)]],
 #endif
-                         uint3 gid [[thread_position_in_grid]]) {
-
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-
-  const uint kernelHXW = 5;
-
-  uint input_arr_size = inTexture.get_array_size();
-
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-
-  VECTOR(P, 4) output = biase[gid.z];
-
-  ushort dilation_x = param.dilationX;
-  VECTOR(P, 4) input[5];
-
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
-
-    input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
-
-    input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-
-    input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
-
-    input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
-
-    for (int j = 0; j < 5; ++j) {
-      VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-
-      VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-
-      VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-
-      VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+                                                uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    
+    const uint kernelHXW = 5;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    VECTOR(P, 4) output = biase[gid.z];
+    
+    ushort dilation_x = param.dilationX;
+    VECTOR(P, 4) input[5];
+    
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
+        
+        input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+        
+        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        
+        input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+        
+        input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
+        
+        for (int j = 0; j < 5; ++j) {
+            VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-  
+    
 #ifdef PRELU_CHANNEL
-  VECTOR(P, 4) alpha_value = alpha[gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    VECTOR(P, 4) alpha_value = alpha[gid.z];
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_ELEMENT
-  int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
-  VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
+    VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_OTHER
-  P alpha_value = alpha[0];
-  output.x = output.x > 0 ? output.x : (alpha_value * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value * output.w);
+    P alpha_value = alpha[0];
+    output.x = output.x > 0 ? output.x : (alpha_value * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value * output.w);
 #endif
-  outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
+    outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
 }
 
 kernel void FUNC3_(depthwise_conv_add_3x3, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
-    texture2d_array<P, access::write> outTexture [[texture(1)]],
-    constant MetalConvParam &param [[buffer(0)]],
-    const device P *weights [[buffer(1)]],
-    const device VECTOR(P, 4) *biase [[buffer(2)]],
+                                                          texture2d_array<P, access::write> outTexture [[texture(1)]],
+                                                          constant MetalConvParam &param [[buffer(0)]],
+                                                          const device P *weights [[buffer(1)]],
+                                                          const device VECTOR(P, 4) *biase [[buffer(2)]],
 #ifdef PRELU_CHANNEL
-    const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                          const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_ELEMENT
-    const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                          const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_OTHER
-    const device P *alpha [[buffer(3)]],
+                                                          const device P *alpha [[buffer(3)]],
 #endif
-    uint3 gid [[thread_position_in_grid]]) {
-
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  VECTOR(P, 4) output = biase[gid.z];
-  VECTOR(P, 4) inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    VECTOR(P, 4) input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-  
+                                                          uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z;
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint weithTo = gid.z * kernelHXW * 4;
+    VECTOR(P, 4) output = biase[gid.z];
+    VECTOR(P, 4) inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {
+        VECTOR(P, 4) input = inputs[j];
+        output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+    }
+    
 #ifdef PRELU_CHANNEL
-  VECTOR(P, 4) alpha_value = alpha[gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    VECTOR(P, 4) alpha_value = alpha[gid.z];
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_ELEMENT
-  int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
-  VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
+    VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_OTHER
-  P alpha_value = alpha[0];
-  output.x = output.x > 0 ? output.x : (alpha_value * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value * output.w);
+    P alpha_value = alpha[0];
+    output.x = output.x > 0 ? output.x : (alpha_value * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value * output.w);
 #endif
-  outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
+    outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
 }
 
 #endif
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPreluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPreluKernel.metal
index f03a1d5b62..407b8385b7 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPreluKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPreluKernel.metal
@@ -18,45 +18,45 @@ using namespace metal;
 
 #define P float
 
-  #define PRELU_CHANNEL prelu_channel
-  #define PRELU_TYPE prelu_channel
-    #include "ConvAddPrelu.inc.metal"
-  #undef  PRELU_TYPE
-  #undef  PRELU_CHANNEL
+#define PRELU_CHANNEL prelu_channel
+#define PRELU_TYPE prelu_channel
+#include "ConvAddPrelu.inc.metal"
+#undef  PRELU_TYPE
+#undef  PRELU_CHANNEL
 
-  #define PRELU_ELEMENT prelu_element
-  #define PRELU_TYPE prelu_element
-    #include "ConvAddPrelu.inc.metal"
-  #undef  PRELU_TYPE
-  #undef  PRELU_ELEMENT
+#define PRELU_ELEMENT prelu_element
+#define PRELU_TYPE prelu_element
+#include "ConvAddPrelu.inc.metal"
+#undef  PRELU_TYPE
+#undef  PRELU_ELEMENT
 
-  #define PRELU_OTHER   prelu_other
-  #define PRELU_TYPE prelu_other
-    #include "ConvAddPrelu.inc.metal"
-  #undef  PRELU_TYPE
-  #undef  PRELU_OTHER
+#define PRELU_OTHER   prelu_other
+#define PRELU_TYPE prelu_other
+#include "ConvAddPrelu.inc.metal"
+#undef  PRELU_TYPE
+#undef  PRELU_OTHER
 
 #undef P
 
 #define P half
 
-  #define PRELU_CHANNEL prelu_channel
-  #define PRELU_TYPE prelu_channel
-    #include "ConvAddPrelu.inc.metal"
-  #undef  PRELU_TYPE
-  #undef  PRELU_CHANNEL
+#define PRELU_CHANNEL prelu_channel
+#define PRELU_TYPE prelu_channel
+#include "ConvAddPrelu.inc.metal"
+#undef  PRELU_TYPE
+#undef  PRELU_CHANNEL
 
-  #define PRELU_ELEMENT prelu_element
-  #define PRELU_TYPE prelu_element
-    #include "ConvAddPrelu.inc.metal"
-  #undef  PRELU_TYPE
-  #undef  PRELU_ELEMENT
+#define PRELU_ELEMENT prelu_element
+#define PRELU_TYPE prelu_element
+#include "ConvAddPrelu.inc.metal"
+#undef  PRELU_TYPE
+#undef  PRELU_ELEMENT
 
-  #define PRELU_OTHER   prelu_other
-  #define PRELU_TYPE prelu_other
-    #include "ConvAddPrelu.inc.metal"
-  #undef  PRELU_TYPE
-  #undef  PRELU_OTHER
+#define PRELU_OTHER   prelu_other
+#define PRELU_TYPE prelu_other
+#include "ConvAddPrelu.inc.metal"
+#undef  PRELU_TYPE
+#undef  PRELU_OTHER
 
 #undef P
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvBNReluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvBNReluKernel.metal
index 4b97b7829a..6851f8aa98 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvBNReluKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvBNReluKernel.metal
@@ -25,41 +25,41 @@ kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTe
                                      const device float4 *new_scale [[buffer(2)]],
                                      const device float4 *new_biase [[buffer(3)]],
                                      uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  float4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
     
-    float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
     
-    float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
     
-    float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-  output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
-  outTexture.write(output, gid.xy, gid.z);
+    float4 output = float4(0.0);
+    
+    float4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+        output.x += dot(input, weight_x);
+        
+        float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+        output.y += dot(input, weight_y);
+        
+        float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+        output.z += dot(input, weight_z);
+        
+        float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(input, weight_w);
+    }
+    output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -69,50 +69,50 @@ kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTe
                                      const device float4 *new_scale [[buffer(2)]],
                                      const device float4 *new_biase [[buffer(3)]],
                                      uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  float4 input[9];
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
-    for (int j = 0; j < 9; ++j) {
-      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
     }
-  }
-  output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
-  outTexture.write(output, gid.xy, gid.z);
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = float4(0.0);
+    
+    float4 input[9];
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+        for (int j = 0; j < 9; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
+    }
+    output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -122,176 +122,176 @@ kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array<float, access::sa
                                                const device float4 *new_scale [[buffer(2)]],
                                                const device float4 *new_biase [[buffer(3)]],
                                                uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = float4(0.0);
-  float4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    float4 input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-  output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
-  outTexture.write(output, gid.xy, gid.z);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z;
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint weithTo = gid.z * kernelHXW * 4;
+    float4 output = float4(0.0);
+    float4 inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {
+        float4 input = inputs[j];
+        output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+    }
+    output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 #pragma mark - half
 kernel void conv_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                     texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                     constant MetalConvParam &param [[buffer(0)]],
-                                     const device half4 *weights [[buffer(1)]],
-                                     const device half4 *new_scale [[buffer(2)]],
-                                     const device half4 *new_biase [[buffer(3)]],
-                                     uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  half4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(float4(input), float4(weight_x));
+                                          texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                          constant MetalConvParam &param [[buffer(0)]],
+                                          const device half4 *weights [[buffer(1)]],
+                                          const device half4 *new_scale [[buffer(2)]],
+                                          const device half4 *new_biase [[buffer(3)]],
+                                          uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
     
-    half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(float4(input), float4(weight_y));
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
     
-    half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(float4(input), float4(weight_z));
+    float4 output = float4(0.0);
     
-    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(float4(input), float4(weight_w));
-  }
-  output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
-  outTexture.write(half4(output), gid.xy, gid.z);
+    half4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+        output.x += dot(float4(input), float4(weight_x));
+        
+        half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+        output.y += dot(float4(input), float4(weight_y));
+        
+        half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+        output.z += dot(float4(input), float4(weight_z));
+        
+        half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(float4(input), float4(weight_w));
+    }
+    output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
 
 kernel void conv_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                     texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                     constant MetalConvParam &param [[buffer(0)]],
-                                     const device half4 *weights [[buffer(1)]],
-                                     const device half4 *new_scale [[buffer(2)]],
-                                     const device half4 *new_biase [[buffer(3)]],
-                                     uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  half4 input[9];
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
-    for (int j = 0; j < 9; ++j) {
-      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(float4(input[j]), float4(weight_x));
-      
-      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(float4(input[j]), float4(weight_y));
-      
-      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(float4(input[j]), float4(weight_z));
-      
-      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(float4(input[j]), float4(weight_w));
+                                          texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                          constant MetalConvParam &param [[buffer(0)]],
+                                          const device half4 *weights [[buffer(1)]],
+                                          const device half4 *new_scale [[buffer(2)]],
+                                          const device half4 *new_biase [[buffer(3)]],
+                                          uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
     }
-  }
-  output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
-  outTexture.write(half4(output), gid.xy, gid.z);
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = float4(0.0);
+    
+    half4 input[9];
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+        for (int j = 0; j < 9; ++j) {
+            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(float4(input[j]), float4(weight_x));
+            
+            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(float4(input[j]), float4(weight_y));
+            
+            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(float4(input[j]), float4(weight_z));
+            
+            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(float4(input[j]), float4(weight_w));
+        }
+    }
+    output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
 
 kernel void depthwise_conv_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                               texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                               constant MetalConvParam &param [[buffer(0)]],
-                                               const device half *weights [[buffer(1)]],
-                                               const device half4 *new_scale [[buffer(2)]],
-                                               const device half4 *new_biase [[buffer(3)]],
-                                               uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = float4(0.0);
-  half4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    half4 input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-  output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
-  outTexture.write(half4(output), gid.xy, gid.z);
+                                                    texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                                    constant MetalConvParam &param [[buffer(0)]],
+                                                    const device half *weights [[buffer(1)]], // scalar weights, read below at gid.z * 36 + channel * 9 + tap
+                                                    const device half4 *new_scale [[buffer(2)]], // one half4 scale per output slice
+                                                    const device half4 *new_biase [[buffer(3)]], // one half4 bias per output slice
+                                                    uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() || // threads outside the output texture have nothing to write
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z; // depthwise: the input slice sampled is the output slice written
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); // centre tap of the 3x3 window
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); // pixel coordinates, nearest texel
+    const uint kernelHXW = 9; // taps in a 3x3 window
+    uint weithTo = gid.z * kernelHXW * 4; // first weight of this slice: 4 channels x 9 taps
+    float4 output = float4(0.0); // accumulate in float although texels and weights are half
+    half4 inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {
+        half4 input = inputs[j]; // operands are widened to float below so products are not rounded to half (same as depthwise_conv_3x3_half)
+        output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]);
+        output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]);
+        output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
+        output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
+    }
+    output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); // scale, add bias, then clamp negatives to 0 (ReLU)
+    outTexture.write(half4(output), gid.xy, gid.z); // narrowed back to half on store
 }
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal
index c07515c13d..c7b3f792d6 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal
@@ -23,49 +23,49 @@ kernel void conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(
                      constant MetalConvParam &param [[buffer(0)]],
                      const device float4 *weights [[buffer(1)]],
                      uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  float4 input[9];
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
-    for (int j = 0; j < 9; ++j) {
-      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = float4(0.0);
+    
+    float4 input[9];
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+        for (int j = 0; j < 9; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-  outTexture.write(output, gid.xy, gid.z);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void depthwise_conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -73,37 +73,37 @@ kernel void depthwise_conv_3x3(texture2d_array<float, access::sample> inTexture
                                constant MetalConvParam &param [[buffer(0)]],
                                const device float *weights [[buffer(1)]],
                                uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = float4(0.0);
-  float4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    float4 input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-  outTexture.write(output, gid.xy, gid.z);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z;
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint weithTo = gid.z * kernelHXW * 4;
+    float4 output = float4(0.0);
+    float4 inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {
+        float4 input = inputs[j];
+        output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+    }
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -111,170 +111,170 @@ kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(
                      constant MetalConvParam &param [[buffer(0)]],
                      const device float4 *weights [[buffer(1)]],
                      uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  float4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
     
-    float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
     
-    float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
+    float4 output = float4(0.0);
     
-    float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-  outTexture.write(output, gid.xy, gid.z);
+    float4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+        output.x += dot(input, weight_x);
+        
+        float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+        output.y += dot(input, weight_y);
+        
+        float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+        output.z += dot(input, weight_z);
+        
+        float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(input, weight_w);
+    }
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 
 kernel void conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                     texture2d_array<half, access::write> outTexture [[texture(1)]],
-                     constant MetalConvParam &param [[buffer(0)]],
-                     const device half4 *weights [[buffer(1)]],
-                     uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  half4 input[9];
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
-    for (int j = 0; j < 9; ++j) {
-      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(float4(input[j]), float4(weight_x));
-      
-      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(float4(input[j]), float4(weight_y));
-      
-      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(float4(input[j]), float4(weight_z));
-      
-      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(float4(input[j]), float4(weight_w));
+                          texture2d_array<half, access::write> outTexture [[texture(1)]],
+                          constant MetalConvParam &param [[buffer(0)]],
+                          const device half4 *weights [[buffer(1)]], // half4 weights, indexed below by output channel, tap j, input slice i
+                          uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() || // threads outside the output texture have nothing to write
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); // centre tap of the 3x3 window
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); // pixel coordinates, nearest texel
+    const uint kernelHXW = 9; // taps in a 3x3 window
+    uint input_arr_size = inTexture.get_array_size(); // number of input slices, each sampled as a half4
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4; // start of this output slice: 4 output channels x 9 taps x input slices
+    
+    float4 output = float4(0.0); // accumulate in float although texels and weights are half
+    
+    half4 input[9];
+    for (uint i = 0; i < input_arr_size; ++i) { // gather the 3x3 neighbourhood of input slice i, then accumulate it
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+        for (int j = 0; j < 9; ++j) { // each tap contributes one dot product to each of the 4 output channels
+            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(float4(input[j]), float4(weight_x)); // widened to float before the dot product
+            
+            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(float4(input[j]), float4(weight_y));
+            
+            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(float4(input[j]), float4(weight_z));
+            
+            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(float4(input[j]), float4(weight_w));
+        }
     }
-  }
-  outTexture.write(half4(output), gid.xy, gid.z);
+    outTexture.write(half4(output), gid.xy, gid.z); // narrowed back to half on store
 }
 
 kernel void depthwise_conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                               texture2d_array<half, access::write> outTexture [[texture(1)]],
-                               constant MetalConvParam &param [[buffer(0)]],
-                               const device half *weights [[buffer(1)]],
-                               uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = float4(0.0);
-  half4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    half4 input = inputs[j];
-    output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]);
-    output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]);
-    output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
-    output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
-  }
-  outTexture.write(half4(output), gid.xy, gid.z);
+                                    texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                    constant MetalConvParam &param [[buffer(0)]],
+                                    const device half *weights [[buffer(1)]], // scalar weights, read below at gid.z * 36 + channel * 9 + tap
+                                    uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() || // threads outside the output texture have nothing to write
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z; // depthwise: the input slice sampled is the output slice written
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); // centre tap of the 3x3 window
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); // pixel coordinates, nearest texel
+    const uint kernelHXW = 9; // taps in a 3x3 window
+    uint weithTo = gid.z * kernelHXW * 4; // first weight of this slice: 4 channels x 9 taps
+    float4 output = float4(0.0); // accumulate in float although texels and weights are half
+    half4 inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) { // channel c of tap j is multiplied by weights[weithTo + c * 9 + j]
+        half4 input = inputs[j];
+        output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]); // both operands widened to float before multiplying
+        output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]);
+        output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
+        output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
+    }
+    outTexture.write(half4(output), gid.xy, gid.z); // narrowed back to half on store
 }
 
 kernel void conv_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                     texture2d_array<half, access::write> outTexture [[texture(1)]],
-                     constant MetalConvParam &param [[buffer(0)]],
-                     const device half4 *weights [[buffer(1)]],
-                     uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  half4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(float4(input), float4(weight_x));
+                          texture2d_array<half, access::write> outTexture [[texture(1)]],
+                          constant MetalConvParam &param [[buffer(0)]],
+                          const device half4 *weights [[buffer(1)]], // half4 weights, indexed below by output channel and input slice i
+                          uint3 gid [[thread_position_in_grid]]) {
     
-    half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(float4(input), float4(weight_y));
+    if (gid.x >= outTexture.get_width() || // threads outside the output texture have nothing to write
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
     
-    half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(float4(input), float4(weight_z));
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); // the single input pixel this thread reads
     
-    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(float4(input), float4(weight_w));
-  }
-  outTexture.write(half4(output), gid.xy, gid.z);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); // pixel coordinates, nearest texel
+    const uint kernelHXW = 1; // a 1x1 kernel has a single tap
+    
+    uint input_arr_size = inTexture.get_array_size(); // number of input slices, each sampled as a half4
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4; // start of this output slice: 4 output channels x input slices
+    
+    float4 output = float4(0.0); // accumulate in float although texels and weights are half
+    
+    half4 input;
+    for (uint i = 0; i < input_arr_size; ++i) { // one dot product per output channel for every input slice
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+        output.x += dot(float4(input), float4(weight_x)); // widened to float before the dot product
+        
+        half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+        output.y += dot(float4(input), float4(weight_y));
+        
+        half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+        output.z += dot(float4(input), float4(weight_z));
+        
+        half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(float4(input), float4(weight_w));
+    }
+    outTexture.write(half4(output), gid.xy, gid.z); // narrowed back to half on store
 }
 
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvTransposeKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvTransposeKernel.metal
index baf3f31157..a324fac188 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvTransposeKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvTransposeKernel.metal
@@ -16,17 +16,17 @@
 using namespace metal;
 
 struct MetalConvTransposeParam{
-  ushort kernelW;
-  ushort kernelH;
-  
-  ushort strideX;
-  ushort strideY;
-  
-  ushort paddingX;
-  ushort paddingY;
-  
-  ushort dilationX;
-  ushort dilationY;
+    ushort kernelW; // NOTE(review): presumably filter width; field layout likely has to match the host-side struct bound at buffer(0) - confirm
+    ushort kernelH; // presumably filter height - confirm
+    
+    ushort strideX; // presumably horizontal stride - confirm
+    ushort strideY; // presumably vertical stride - confirm
+    
+    ushort paddingX; // presumably horizontal padding - confirm
+    ushort paddingY; // presumably vertical padding - confirm
+    
+    ushort dilationX; // presumably horizontal dilation - confirm
+    ushort dilationY; // presumably vertical dilation - confirm
 };
 
 kernel void conv_transpose2x2_stride2(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -34,83 +34,83 @@ kernel void conv_transpose2x2_stride2(texture2d_array<float, access::sample> inT
                                       constant MetalConvTransposeParam &param [[buffer(0)]],
                                       const device float4 *weights [[buffer(1)]],
                                       uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  int input_array_size = inTexture.get_array_size();
-  int kernel_index_x = gid.x % 2;
-  int kernel_index_y = gid.y % 2;
-  int kernel_index = kernel_index_y * 2 + kernel_index_x;
-  int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size);
-  int input_x = gid.x / 2;
-  int input_y = gid.y / 2;
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  float4 output = float4(0.0);
-  for (int i = 0; i < input_array_size; ++i) {
-    
-    float4 input = inTexture.sample(sample, float2(input_x, input_y), i);
-    
-    float4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i];
-    float4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i];
-    float4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i];
-    float4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i];
-    
-    output.x += dot(input, kernel_slice0);
+    if (gid.x >= outTexture.get_width() || // 2x2 transposed convolution, stride 2: one thread per output texel (gid.xy) and output slice (gid.z)
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return; // thread lies outside the output texture array
+    }
     
-    output.y += dot(input, kernel_slice1);
+    int input_array_size = inTexture.get_array_size(); // number of 4-component input slices summed over
+    int kernel_index_x = gid.x % 2; // parity of the output x selects the kernel column
+    int kernel_index_y = gid.y % 2; // parity of the output y selects the kernel row
+    int kernel_index = kernel_index_y * 2 + kernel_index_x; // flattened tap index 0..3
+    int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size); // weights are indexed as ((gid.z * 4 + outComponent) * 4 + tap) * input_array_size + i
+    int input_x = gid.x / 2; // each input texel feeds a 2x2 block of output texels
+    int input_y = gid.y / 2;
     
-    output.z += dot(input, kernel_slice2);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    float4 output = float4(0.0);
+    for (int i = 0; i < input_array_size; ++i) {
+        // Four input components of slice i at the source texel.
+        float4 input = inTexture.sample(sample, float2(input_x, input_y), i);
+        // One float4 of weights per output component (x, y, z, w) for this tap and input slice.
+        float4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i];
+        float4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i];
+        float4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i];
+        float4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i];
+        
+        output.x += dot(input, kernel_slice0);
+        
+        output.y += dot(input, kernel_slice1);
+        
+        output.z += dot(input, kernel_slice2);
+        
+        output.w += dot(input, kernel_slice3);
+    }
     
-    output.w += dot(input, kernel_slice3);
-  }
-  
-  outTexture.write(output, gid.xy, gid.z);
+    outTexture.write(output, gid.xy, gid.z); // `param` is never read by this kernel
 }
 
 kernel void conv_transpose2x2_stride2_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                      texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                      constant MetalConvTransposeParam &param [[buffer(0)]],
-                                      const device half4 *weights [[buffer(1)]],
-                                      uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  int input_array_size = inTexture.get_array_size();
-  int kernel_index_x = gid.x % 2;
-  int kernel_index_y = gid.y % 2;
-  int kernel_index = kernel_index_y * 2 + kernel_index_x;
-  int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size);
-  int input_x = gid.x / 2;
-  int input_y = gid.y / 2;
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  float4 output = float4(0.0);
-  for (int i = 0; i < input_array_size; ++i) {
-    
-    half4 input = inTexture.sample(sample, float2(input_x, input_y), i);
-    
-    half4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i];
-    half4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i];
-    half4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i];
-    half4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i];
-    
-    output.x += dot(float4(input), float4(kernel_slice0));
+                                           texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                           constant MetalConvTransposeParam &param [[buffer(0)]],
+                                           const device half4 *weights [[buffer(1)]],
+                                           uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() || // half-precision storage variant of the 2x2, stride-2 transposed convolution
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return; // thread lies outside the output texture array
+    }
     
-    output.y += dot(float4(input), float4(kernel_slice1));
+    int input_array_size = inTexture.get_array_size(); // number of 4-component input slices summed over
+    int kernel_index_x = gid.x % 2; // parity of the output x selects the kernel column
+    int kernel_index_y = gid.y % 2; // parity of the output y selects the kernel row
+    int kernel_index = kernel_index_y * 2 + kernel_index_x; // flattened tap index 0..3
+    int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size); // weights are indexed as ((gid.z * 4 + outComponent) * 4 + tap) * input_array_size + i
+    int input_x = gid.x / 2; // each input texel feeds a 2x2 block of output texels
+    int input_y = gid.y / 2;
     
-    output.z += dot(float4(input), float4(kernel_slice2));
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    float4 output = float4(0.0); // accumulator is float4 even though the texture and weights are half
+    for (int i = 0; i < input_array_size; ++i) {
+        // Four input components of slice i at the source texel.
+        half4 input = inTexture.sample(sample, float2(input_x, input_y), i);
+        // One half4 of weights per output component (x, y, z, w) for this tap and input slice.
+        half4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i];
+        half4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i];
+        half4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i];
+        half4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i];
+        // Both operands are widened to float4 before each dot product.
+        output.x += dot(float4(input), float4(kernel_slice0));
+        
+        output.y += dot(float4(input), float4(kernel_slice1));
+        
+        output.z += dot(float4(input), float4(kernel_slice2));
+        
+        output.w += dot(float4(input), float4(kernel_slice3));
+    }
     
-    output.w += dot(float4(input), float4(kernel_slice3));
-  }
-  
-  outTexture.write(half4(output), gid.xy, gid.z);
+    outTexture.write(half4(output), gid.xy, gid.z); // narrowed to half only at the final write; `param` is never read
 }
 
 //kernel void conv_transpose(texture2d_array<float, access::sample> inTexture [[texture(0)]],
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal
index b152df8281..40cad28df1 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal
@@ -18,13 +18,13 @@
 using namespace metal;
 
 struct ElementwiseAddParam {
-  int32_t fast;
-  int32_t axis;
-  int32_t ylen;
-  int32_t xdim[4];
-  int32_t xtrans[4];
-  int32_t ydim[4];
-  int32_t ytrans[4];
+    int32_t fast; // 1 makes the kernels read inputY at the same coordinate as inputX; any other value takes the index-remapping path
+    int32_t axis; // first position of the remapped X index that is copied into the Y index
+    int32_t ylen; // how many consecutive index positions are copied, starting at axis
+    int32_t xdim[4]; // only xdim[3] is used by the kernels in this file (passed to xyzn2abcd)
+    int32_t xtrans[4]; // passed to invtrans() for the X index
+    int32_t ydim[4]; // only ydim[3] is used by the kernels in this file (passed to abcd2xyzn)
+    int32_t ytrans[4]; // passed to trans() for the Y index
 };
 
 kernel void elementwise_add(texture2d_array<float, access::read> inputX [[texture(0)]],
@@ -32,69 +32,69 @@ kernel void elementwise_add(texture2d_array<float, access::read> inputX [[textur
                             texture2d_array<float, access::write> outTexture [[texture(2)]],
                             constant ElementwiseAddParam &pm [[buffer(0)]],
                             uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  float4 rx, ry;
-
-  if (pm.fast == 1) {
-    rx = inputX.read(gid.xy, gid.z);
-    ry = inputY.read(gid.xy, gid.z);
-  } else {
-    rx = inputX.read(gid.xy, gid.z);
-    int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
-    int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
-    int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
-    int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
-    int32_t yshift = 4 - pm.ylen - pm.axis;
-    for (int n = 0; n < 4; n++) {
-      x_xyzn[3] = n;
-      xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
-      invtrans(xtrans, x_abcd, t_abcd);
-      for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) {
-        y_abcd[yshift+k] = t_abcd[k];
-      }
-      trans(ytrans, y_abcd, t_abcd);
-      abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
-      ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
+    if (gid.x >= outTexture.get_width() || // one thread per output texel (gid.xy) and slice (gid.z)
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    float4 rx, ry;
+    // Element-wise add: writes inputX + inputY; pm.fast selects how the inputY operand is addressed.
+    if (pm.fast == 1) {
+        rx = inputX.read(gid.xy, gid.z);
+        ry = inputY.read(gid.xy, gid.z); // fast path: both operands are read at the same coordinate
+    } else {
+        rx = inputX.read(gid.xy, gid.z);
+        int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; // x_xyzn = (texel x, texel y, slice, component)
+        int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; // entries of y_abcd that are not filled below stay 0
+        int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; // local copies of the parameter arrays handed to invtrans/trans
+        int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
+        int32_t yshift = 4 - pm.ylen - pm.axis; // makes the copied indices land in the last pm.ylen entries of y_abcd
+        for (int n = 0; n < 4; n++) { // each of the four components is resolved separately
+            x_xyzn[3] = n;
+            xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); // helper defined outside this hunk; presumably texture coordinate -> 4-D index (TODO confirm)
+            invtrans(xtrans, x_abcd, t_abcd); // helper defined outside this hunk; presumably undoes the xtrans permutation (TODO confirm)
+            for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) {
+                y_abcd[yshift+k] = t_abcd[k];
+            }
+            trans(ytrans, y_abcd, t_abcd); // t_abcd is reused as the output of the Y-side transform
+            abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
+            ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; // take one component of the addressed Y texel
+        }
     }
-  }
-  float4 r = rx + ry;
-  outTexture.write(r, gid.xy, gid.z);
+    float4 r = rx + ry;
+    outTexture.write(r, gid.xy, gid.z);
 }
 
 kernel void elementwise_add_half(texture2d_array<half, access::read> inputX [[texture(0)]],
-                            texture2d_array<half, access::read> inputY [[texture(1)]],
-                            texture2d_array<half, access::write> outTexture [[texture(2)]],
-                            constant ElementwiseAddParam &pm [[buffer(0)]],
-                            uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  half4 rx, ry;
-
-  if (pm.fast == 1) {
-    rx = inputX.read(gid.xy, gid.z);
-    ry = inputY.read(gid.xy, gid.z);
-  } else {
-    rx = inputX.read(gid.xy, gid.z);
-    int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
-    int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
-    int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
-    int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
-    int32_t yshift = 4 - pm.ylen - pm.axis;
-    for (int n = 0; n < 4; n++) {
-      x_xyzn[3] = n;
-      xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
-      invtrans(xtrans, x_abcd, t_abcd);
-      for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) {
-        y_abcd[yshift+k] = t_abcd[k];
-      }
-      trans(ytrans, y_abcd, t_abcd);
-      abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
-      ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
+                                 texture2d_array<half, access::read> inputY [[texture(1)]],
+                                 texture2d_array<half, access::write> outTexture [[texture(2)]],
+                                 constant ElementwiseAddParam &pm [[buffer(0)]],
+                                 uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() || // one thread per output texel (gid.xy) and slice (gid.z)
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    half4 rx, ry;
+    // Half-precision element-wise add: the sum is computed and written as half4.
+    if (pm.fast == 1) {
+        rx = inputX.read(gid.xy, gid.z);
+        ry = inputY.read(gid.xy, gid.z); // fast path: both operands are read at the same coordinate
+    } else {
+        rx = inputX.read(gid.xy, gid.z);
+        int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; // x_xyzn = (texel x, texel y, slice, component)
+        int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; // entries of y_abcd that are not filled below stay 0
+        int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; // local copies of the parameter arrays handed to invtrans/trans
+        int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
+        int32_t yshift = 4 - pm.ylen - pm.axis; // makes the copied indices land in the last pm.ylen entries of y_abcd
+        for (int n = 0; n < 4; n++) { // each of the four components is resolved separately
+            x_xyzn[3] = n;
+            xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); // helper defined outside this hunk; presumably texture coordinate -> 4-D index (TODO confirm)
+            invtrans(xtrans, x_abcd, t_abcd); // helper defined outside this hunk; presumably undoes the xtrans permutation (TODO confirm)
+            for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) {
+                y_abcd[yshift+k] = t_abcd[k];
+            }
+            trans(ytrans, y_abcd, t_abcd); // t_abcd is reused as the output of the Y-side transform
+            abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
+            ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; // take one component of the addressed Y texel
+        }
     }
-  }
-  half4 r = rx + ry;
-  outTexture.write(r, gid.xy, gid.z);
+    half4 r = rx + ry;
+    outTexture.write(r, gid.xy, gid.z);
 }
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.inc.metal
index b1d68d6809..65566952ef 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.inc.metal
@@ -20,72 +20,72 @@
 using namespace metal;
 
 kernel void FUNC3_(elementwise_add, PRELU_TYPE, P)(texture2d_array<P, access::read> inputX [[texture(0)]],
-                                 texture2d_array<P, access::read> inputY [[texture(1)]],
-                                 texture2d_array<P, access::write> outTexture [[texture(2)]],
-                                 constant ElementwiseAddParam &pm [[buffer(0)]],
+                                                   texture2d_array<P, access::read> inputY [[texture(1)]],
+                                                   texture2d_array<P, access::write> outTexture [[texture(2)]],
+                                                   constant ElementwiseAddParam &pm [[buffer(0)]],
 #ifdef PRELU_CHANNEL
-                                 const device VECTOR(P, 4) *alpha [[buffer(1)]],
+                                                   const device VECTOR(P, 4) *alpha [[buffer(1)]],
 #endif
 #ifdef PRELU_ELEMENT
-                                 const device VECTOR(P, 4) *alpha [[buffer(1)]],
+                                                   const device VECTOR(P, 4) *alpha [[buffer(1)]],
 #endif
 #ifdef PRELU_OTHER
-                                 const device P *alpha [[buffer(1)]],
+                                                   const device P *alpha [[buffer(1)]],
 #endif
-                                 uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  VECTOR(P, 4) rx, ry;
-  
-  if (pm.fast == 1) {
-    rx = inputX.read(gid.xy, gid.z);
-    ry = inputY.read(gid.xy, gid.z);
+                                                   uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() || // one thread per output texel (gid.xy) and slice (gid.z)
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    VECTOR(P, 4) rx, ry;
+    // Element-wise add followed by PReLU; the alpha layout is chosen at compile time by PRELU_CHANNEL / PRELU_ELEMENT / PRELU_OTHER.
+    if (pm.fast == 1) {
+        rx = inputX.read(gid.xy, gid.z);
+        ry = inputY.read(gid.xy, gid.z); // fast path: both operands are read at the same coordinate
     } else {
-      rx = inputX.read(gid.xy, gid.z);
-      int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
-      int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
-      int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
-      int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
-      int32_t yshift = 4 - pm.ylen - pm.axis;
-      for (int n = 0; n < 4; n++) {
-        x_xyzn[3] = n;
-        xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
-        invtrans(xtrans, x_abcd, t_abcd);
-        for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) {
-          y_abcd[yshift+k] = t_abcd[k];
+        rx = inputX.read(gid.xy, gid.z);
+        int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; // x_xyzn = (texel x, texel y, slice, component)
+        int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; // entries of y_abcd that are not filled below stay 0
+        int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; // local copies of the parameter arrays handed to invtrans/trans
+        int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
+        int32_t yshift = 4 - pm.ylen - pm.axis; // makes the copied indices land in the last pm.ylen entries of y_abcd
+        for (int n = 0; n < 4; n++) { // each of the four components is resolved separately
+            x_xyzn[3] = n;
+            xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); // helper defined outside this hunk; presumably texture coordinate -> 4-D index (TODO confirm)
+            invtrans(xtrans, x_abcd, t_abcd); // helper defined outside this hunk; presumably undoes the xtrans permutation (TODO confirm)
+            for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) {
+                y_abcd[yshift+k] = t_abcd[k];
+            }
+            trans(ytrans, y_abcd, t_abcd); // t_abcd is reused as the output of the Y-side transform
+            abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
+            ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; // take one component of the addressed Y texel
         }
-        trans(ytrans, y_abcd, t_abcd);
-        abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
-        ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
-      }
-  }
-  VECTOR(P, 4) output = rx + ry;
-  
+    }
+    VECTOR(P, 4) output = rx + ry;
+    // PReLU: components greater than 0 pass through unchanged, all others are multiplied by alpha.
 #ifdef PRELU_CHANNEL
-  VECTOR(P, 4) alpha_value = alpha[gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    VECTOR(P, 4) alpha_value = alpha[gid.z]; // one 4-vector of alphas per output slice
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_ELEMENT
-  int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
-  VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); // alpha is indexed by (y, x) first, then slice
+    VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; // one 4-vector of alphas per output texel and slice
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_OTHER
-  P alpha_value = alpha[0];
-  output.x = output.x > 0 ? output.x : (alpha_value * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value * output.w);
+    P alpha_value = alpha[0]; // a single scalar alpha shared by every component
+    output.x = output.x > 0 ? output.x : (alpha_value * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value * output.w);
 #endif
-  
-  outTexture.write(output, gid.xy, gid.z);
+    
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 #endif
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal
index 8fd1a9fdab..cca11e8086 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal
@@ -17,13 +17,13 @@
 using namespace metal;
 
 struct ElementwiseAddParam {
-  int32_t fast;
-  int32_t axis;
-  int32_t ylen;
-  int32_t xdim[4];
-  int32_t xtrans[4];
-  int32_t ydim[4];
-  int32_t ytrans[4];
+    int32_t fast; // NOTE(review): same field list as ElementwiseAddParam in Elementwise.metal — keep the two definitions in sync
+    int32_t axis; // presumably the first X index position matched against Y — confirm against the kernel that consumes this struct
+    int32_t ylen; // presumably the number of index positions taken from X, starting at axis
+    int32_t xdim[4];
+    int32_t xtrans[4];
+    int32_t ydim[4];
+    int32_t ytrans[4];
 };
 
 #define P float
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal
index 9655b0fc1a..114aa15664 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal
@@ -23,38 +23,38 @@
 #define VECTOR(p, n) CONCAT2(p, n)
 
 kernel void FUNC_T(fetch, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
-                  device float *output [[buffer(0)]],
-                  uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= inTexture.get_width() ||
-      gid.y >= inTexture.get_height() ||
-      gid.z >= inTexture.get_array_size()) {
-    return;
-  }
-
-  int input_width = inTexture.get_width();
-  int input_height = inTexture.get_height();
-  const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z);
-  int output_to = 4 * input_width * input_height;
-  
-  output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
-  
-  output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y;
-  output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
-  output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
+                             device float *output [[buffer(0)]],
+                             uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= inTexture.get_width() ||
+        gid.y >= inTexture.get_height() ||
+        gid.z >= inTexture.get_array_size()) {
+        return; // thread lies outside the input texture array
+    }
+    // Copies a texture array into a float buffer: one width*height plane per component, four planes per slice.
+    int input_width = inTexture.get_width();
+    int input_height = inTexture.get_height();
+    const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z);
+    int output_to = 4 * input_width * input_height; // floats occupied by one slice (its four component planes)
+    // Plane index is gid.z * 4 + component; inside a plane elements are row-major (y * width + x).
+    output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
+    
+    output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y;
+    output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
+    output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
 }
 
 kernel void FUNC(fetch, 1or2, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
-                             device float4 *output [[buffer(0)]],
-                             uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= inTexture.get_width() ||
-      gid.y >= inTexture.get_height() ||
-      gid.z >= inTexture.get_array_size()) {
-    return;
-  }
-  
-  int input_width = inTexture.get_width();
-  const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z);
-  output[gid.y * input_width + gid.x] = float4(input);
+                                 device float4 *output [[buffer(0)]],
+                                 uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= inTexture.get_width() ||
+        gid.y >= inTexture.get_height() ||
+        gid.z >= inTexture.get_array_size()) {
+        return; // thread lies outside the input texture array
+    }
+    // Copies each texel as one float4 into a row-major (y * width + x) buffer.
+    int input_width = inTexture.get_width();
+    const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z);
+    output[gid.y * input_width + gid.x] = float4(input); // NOTE(review): gid.z is not part of the index, so this presumably expects a single-slice texture — confirm with the caller
 }
 
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.metal
index 87d304302f..df2de98648 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.metal
@@ -31,7 +31,7 @@ using namespace metal;
 kernel void fetch_placeholder(texture2d_array<float, access::read> inTexture [[texture(0)]],
                               device float *output [[buffer(0)]],
                               uint3 gid [[thread_position_in_grid]]) {
-  
+    // Empty body: this kernel reads nothing and writes nothing.
 }
 
 kernel void fetch_placeholder_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Kernels.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Kernels.metal
index 368509f001..06bf42697e 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Kernels.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Kernels.metal
@@ -23,47 +23,47 @@ kernel void place_holder(texture2d<half, access::read> inTexture [[texture(0)]],
 }
 
 struct OutputDim {
-  ushort width;
-  ushort height;
-  ushort strideX;
-  ushort strideY;
+    ushort width; // not read by the resize kernel in this file
+    ushort height; // not read by the resize kernel in this file
+    ushort strideX; // multiplied with the output x to get the source texel x in resize
+    ushort strideY; // multiplied with the output y to get the source texel y in resize
 };
 
 kernel void resize(texture2d<half, access::read> inTexture [[texture(0)]],
                    texture2d_array<half, access::write> outTexture [[texture(1)]],
                    constant OutputDim &params [[buffer(0)]],
                    uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  
-  constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint2 pos = gid.xy * uint2(params.strideX, params.strideY);
-  const half4 input = inTexture.read(pos);
-  outTexture.write(half4(input.x, input.y, input.z, input.w), gid.xy, gid.z);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    // Subsamples inTexture by (strideX, strideY); every output slice gid.z receives the same source texel.
+    // read() takes integer texel coordinates, so no sampler object is required here.
+    const uint2 pos = gid.xy * uint2(params.strideX, params.strideY);
+    const half4 input = inTexture.read(pos);
+    outTexture.write(input, gid.xy, gid.z);
 }
 
 
 kernel void texture2d_to_2d_array(texture2d<float, access::read> inTexture [[texture(0)]],
                                   texture2d_array<float, access::write> outTexture [[texture(1)]],
                                   uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= inTexture.get_width() ||
-      gid.y >= inTexture.get_height()){
-    return;
-  }
-  const float4 input = inTexture.read(gid.xy);
-  outTexture.write(input, gid.xy, 0);
+    if (gid.x >= inTexture.get_width() || // bounds are checked against the input texture only
+        gid.y >= inTexture.get_height()){
+        return;
+    }
+    const float4 input = inTexture.read(gid.xy);
+    outTexture.write(input, gid.xy, 0); // copies the 2-D texture into slice 0 of the array; gid.z is ignored
 }
 
 kernel void texture2d_to_2d_array_half(texture2d<half, access::read> inTexture [[texture(0)]],
-                                      texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                      uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= inTexture.get_width() ||
-      gid.y >= inTexture.get_height()){
-    return;
-  }
-  const half4 input = inTexture.read(gid.xy);
-  outTexture.write(input, gid.xy, 0);
+                                       texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                       uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= inTexture.get_width() || // bounds are checked against the input texture only
+        gid.y >= inTexture.get_height()){
+        return;
+    }
+    const half4 input = inTexture.read(gid.xy);
+    outTexture.write(input, gid.xy, 0); // copies the 2-D half texture into slice 0 of the array; gid.z is ignored
 }
 
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/NMSFetchResultKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/NMSFetchResultKernel.metal
index 44c57440e1..e32c98cc29 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/NMSFetchResultKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/NMSFetchResultKernel.metal
@@ -16,65 +16,65 @@
 using namespace metal;
 
 kernel void nms_fetch_result(texture2d_array<float, access::read> inTexture [[texture(0)]],
-    device float *output [[buffer(0)]],
-    uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= inTexture.get_width() ||
-      gid.y >= inTexture.get_height() ||
-      gid.z >= inTexture.get_array_size()) {
-    return;
-  }
-  
-  int input_width = inTexture.get_width();
-  const float4 input = inTexture.read(gid.xy, gid.z);
-  output[gid.y * input_width + gid.x] = input.x;
-  
+                             device float *output [[buffer(0)]],
+                             uint3 gid [[thread_position_in_grid]]) {
+    // Copies the x component of each texel into a row-major (y * width + x) float buffer.
+    if (gid.x >= inTexture.get_width() ||
+        gid.y >= inTexture.get_height() ||
+        gid.z >= inTexture.get_array_size()) {
+        return; // thread lies outside the input texture array
+    }
+    
+    int input_width = inTexture.get_width();
+    const float4 input = inTexture.read(gid.xy, gid.z);
+    output[gid.y * input_width + gid.x] = input.x; // the y, z and w components are dropped
+    // NOTE(review): gid.z is not part of the output index, so this presumably expects a single-slice texture — confirm with the caller.
 }
 
 
 kernel void nms_fetch_result_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
-                             device float *output [[buffer(0)]],
-                             uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= inTexture.get_width() ||
-      gid.y >= inTexture.get_height() ||
-      gid.z >= inTexture.get_array_size()) {
-    return;
-  }
-  
-  int input_width = inTexture.get_width();
-  const half4 input = inTexture.read(gid.xy, gid.z);
-  output[gid.y * input_width + gid.x] = input.x;
+                                  device float *output [[buffer(0)]],
+                                  uint3 gid [[thread_position_in_grid]]) {
+    // Copies the x component of each half texel into a row-major (y * width + x) float buffer.
+    if (gid.x >= inTexture.get_width() ||
+        gid.y >= inTexture.get_height() ||
+        gid.z >= inTexture.get_array_size()) {
+        return; // thread lies outside the input texture array
+    }
+    // NOTE(review): gid.z is not part of the output index, so this presumably expects a single-slice texture — confirm with the caller.
+    int input_width = inTexture.get_width();
+    const half4 input = inTexture.read(gid.xy, gid.z);
+    output[gid.y * input_width + gid.x] = input.x; // half value stored into a float element
 }
 
 kernel void nms_fetch_bbox(texture2d_array<float, access::read> inTexture [[texture(0)]],
-    device float4 *output [[buffer(0)]],
-    uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= inTexture.get_width() ||
-      gid.y >= inTexture.get_height() ||
-      gid.z >= inTexture.get_array_size()) {
-    return;
-  }
-  
-  int input_width = inTexture.get_width();
-//  int input_height = inTexture.get_height();
-  const float4 input = inTexture.read(gid.xy, gid.z);
-  output[gid.y * input_width + gid.x] = input;
+                           device float4 *output [[buffer(0)]],
+                           uint3 gid [[thread_position_in_grid]]) {
+    // Copies each texel as one float4 into a row-major (y * width + x) buffer.
+    if (gid.x >= inTexture.get_width() ||
+        gid.y >= inTexture.get_height() ||
+        gid.z >= inTexture.get_array_size()) {
+        return; // thread lies outside the input texture array
+    }
+    // NOTE(review): gid.z is not part of the output index, so this presumably expects a single-slice texture — confirm with the caller.
+    int input_width = inTexture.get_width();
+    //  int input_height = inTexture.get_height();
+    const float4 input = inTexture.read(gid.xy, gid.z);
+    output[gid.y * input_width + gid.x] = input;
 }
 
 kernel void nms_fetch_bbox_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
-                           device float4 *output [[buffer(0)]],
-                           uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= inTexture.get_width() ||
-      gid.y >= inTexture.get_height() ||
-      gid.z >= inTexture.get_array_size()) {
-    return;
-  }
-  
-  int input_width = inTexture.get_width();
-//  int input_height = inTexture.get_height();
-  const half4 input = inTexture.read(gid.xy, gid.z);
-  output[gid.y * input_width + gid.x] = float4(input);
+                                device float4 *output [[buffer(0)]],
+                                uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= inTexture.get_width() ||
+        gid.y >= inTexture.get_height() ||
+        gid.z >= inTexture.get_array_size()) {
+        return; // thread lies outside the input texture array
+    }
+    // Copies each half4 texel, widened to float4, into a row-major (y * width + x) buffer.
+    int input_width = inTexture.get_width();
+    //  int input_height = inTexture.get_height();
+    const half4 input = inTexture.read(gid.xy, gid.z);
+    output[gid.y * input_width + gid.x] = float4(input); // NOTE(review): gid.z is not part of the index — presumably a single-slice texture; confirm with the caller
 }
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.inc.metal
index 3c36ba06f5..05146b8d14 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.inc.metal
@@ -15,36 +15,36 @@
 #ifdef P
 
 kernel void FUNC2_(pool, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
-                 texture2d_array<P, access::write> outTexture [[texture(1)]],
-                 constant PoolParam &pm [[buffer(0)]],
-                 uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  int xmin = gid.x * pm.strideX - pm.paddingX;
-  int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
-  xmin = max(xmin, 0);
-  int ymin = gid.y * pm.strideX - pm.paddingX;
-  int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height()));
-  ymin = max(ymin, 0);
-  
-  VECTOR(P, 4) r = 0;
-  if (pm.poolType == 0) {
-    r = inTexture.read(uint2(xmin, ymin), gid.z);
-    for (int x = xmin; x < xmax; x++) {
-      for (int y = ymin; y < ymax; y++) {
-        r = fmax(r, inTexture.read(uint2(x, y), gid.z));
-      }
+                            texture2d_array<P, access::write> outTexture [[texture(1)]],
+                            constant PoolParam &pm [[buffer(0)]],
+                            uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    int xmin = gid.x * pm.strideX - pm.paddingX;
+    int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
+    xmin = max(xmin, 0);
+    int ymin = gid.y * pm.strideY - pm.paddingY; // rows use the Y stride/padding from PoolParam
+    int ymax = min(ymin + pm.ksizeY, int(inTexture.get_height())); // rows use the Y kernel size
+    ymin = max(ymin, 0);
+    
+    VECTOR(P, 4) r = 0;
+    if (pm.poolType == 0) {
+        r = inTexture.read(uint2(xmin, ymin), gid.z);
+        for (int x = xmin; x < xmax; x++) {
+            for (int y = ymin; y < ymax; y++) {
+                r = fmax(r, inTexture.read(uint2(x, y), gid.z));
+            }
+        }
+    } else if (pm.poolType == 1) {
+        for (int x = xmin; x < xmax; x++) {
+            for (int y = ymin; y < ymax; y++) {
+                r += inTexture.read(uint2(x, y), gid.z);
+            }
+        }
+        r /= (xmax - xmin) * (ymax - ymin);
     }
-  } else if (pm.poolType == 1) {
-    for (int x = xmin; x < xmax; x++) {
-      for (int y = ymin; y < ymax; y++) {
-        r += inTexture.read(uint2(x, y), gid.z);
-      }
-    }
-    r /= (xmax - xmin) * (ymax - ymin);
-  }
-  outTexture.write(r, gid.xy, gid.z);
+    outTexture.write(r, gid.xy, gid.z);
 }
 
 #endif
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.metal
index e76b4ac742..30111b7bcb 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.metal
@@ -18,13 +18,13 @@
 using namespace metal;
 
 struct PoolParam {
-  int ksizeX;
-  int ksizeY;
-  int strideX;
-  int strideY;
-  int paddingX;
-  int paddingY;
-  int poolType;
+    int ksizeX;
+    int ksizeY;
+    int strideX;
+    int strideY;
+    int paddingX;
+    int paddingY;
+    int poolType;
 };
 
 #define P half
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PreluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PreluKernel.metal
index 5978041377..6279821436 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PreluKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PreluKernel.metal
@@ -16,136 +16,136 @@
 using namespace metal;
 
 kernel void prelu_channel(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                           texture2d_array<float, access::write> outTexture [[texture(1)]],
-                           const device float4 *alpha [[buffer(0)]],
-                           uint3 gid [[thread_position_in_grid]]){
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
-  float4 alpha_value = alpha[gid.z];
-  float4 output;
-  output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
-  output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
-  output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
-  output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
-  outTexture.write(output, gid.xy, gid.z);
+                          texture2d_array<float, access::write> outTexture [[texture(1)]],
+                          const device float4 *alpha [[buffer(0)]],
+                          uint3 gid [[thread_position_in_grid]]){
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+    float4 alpha_value = alpha[gid.z];
+    float4 output;
+    output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
+    output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
+    output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
+    output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void prelu_element(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                           texture2d_array<float, access::write> outTexture [[texture(1)]],
                           const device float4 *alpha [[buffer(0)]],
                           uint3 gid [[thread_position_in_grid]]){
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
-
-  int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size();
-  float4 alpha_value = alpha[alpha_to + gid.z];
-
-  float4 output;
-  output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
-  output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
-  output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
-  output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
-  outTexture.write(output, gid.xy, gid.z);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+    
+    int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size();
+    float4 alpha_value = alpha[alpha_to + gid.z];
+    
+    float4 output;
+    output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
+    output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
+    output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
+    output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void prelu_other(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                          texture2d_array<float, access::write> outTexture [[texture(1)]],
-                          const device float *alpha [[buffer(0)]],
-                          uint3 gid [[thread_position_in_grid]]){
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
-  float alpha_value = alpha[0];
-  float4 output;
-  output.x = input.x > 0 ? input.x : (alpha_value * input.x);
-  output.y = input.y > 0 ? input.y : (alpha_value * input.y);
-  output.z = input.z > 0 ? input.z : (alpha_value * input.z);
-  output.w = input.w > 0 ? input.w : (alpha_value * input.w);
-  outTexture.write(output, gid.xy, gid.z);
+                        texture2d_array<float, access::write> outTexture [[texture(1)]],
+                        const device float *alpha [[buffer(0)]],
+                        uint3 gid [[thread_position_in_grid]]){
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+    float alpha_value = alpha[0];
+    float4 output;
+    output.x = input.x > 0 ? input.x : (alpha_value * input.x);
+    output.y = input.y > 0 ? input.y : (alpha_value * input.y);
+    output.z = input.z > 0 ? input.z : (alpha_value * input.z);
+    output.w = input.w > 0 ? input.w : (alpha_value * input.w);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 
 kernel void prelu_channel_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                          texture2d_array<half, access::write> outTexture [[texture(1)]],
-                          const device half4 *alpha [[buffer(0)]],
-                          uint3 gid [[thread_position_in_grid]]){
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
-  half4 alpha_value = alpha[gid.z];
-  half4 output;
-  output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
-  output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
-  output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
-  output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
-  outTexture.write(output, gid.xy, gid.z);
+                               texture2d_array<half, access::write> outTexture [[texture(1)]],
+                               const device half4 *alpha [[buffer(0)]],
+                               uint3 gid [[thread_position_in_grid]]){
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+    half4 alpha_value = alpha[gid.z];
+    half4 output;
+    output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
+    output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
+    output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
+    output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void prelu_element_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                          texture2d_array<half, access::write> outTexture [[texture(1)]],
-                          const device half4 *alpha [[buffer(0)]],
-                          uint3 gid [[thread_position_in_grid]]){
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
-  
-  int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size();
-  half4 alpha_value = alpha[alpha_to + gid.z];
-  
-  half4 output;
-  output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
-  output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
-  output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
-  output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
-  outTexture.write(output, gid.xy, gid.z);
+                               texture2d_array<half, access::write> outTexture [[texture(1)]],
+                               const device half4 *alpha [[buffer(0)]],
+                               uint3 gid [[thread_position_in_grid]]){
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+    
+    int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size();
+    half4 alpha_value = alpha[alpha_to + gid.z];
+    
+    half4 output;
+    output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
+    output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
+    output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
+    output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void prelu_other_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                        texture2d_array<half, access::write> outTexture [[texture(1)]],
-                        const device half *alpha [[buffer(0)]],
-                        uint3 gid [[thread_position_in_grid]]){
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
-  half alpha_value = alpha[0];
-  half4 output;
-  output.x = input.x > 0 ? input.x : (alpha_value * input.x);
-  output.y = input.y > 0 ? input.y : (alpha_value * input.y);
-  output.z = input.z > 0 ? input.z : (alpha_value * input.z);
-  output.w = input.w > 0 ? input.w : (alpha_value * input.w);
-  outTexture.write(output, gid.xy, gid.z);
+                             texture2d_array<half, access::write> outTexture [[texture(1)]],
+                             const device half *alpha [[buffer(0)]],
+                             uint3 gid [[thread_position_in_grid]]){
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+    half alpha_value = alpha[0];
+    half4 output;
+    output.x = input.x > 0 ? input.x : (alpha_value * input.x);
+    output.y = input.y > 0 ? input.y : (alpha_value * input.y);
+    output.z = input.z > 0 ? input.z : (alpha_value * input.z);
+    output.w = input.w > 0 ? input.w : (alpha_value * input.w);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PriorBoxKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PriorBoxKernel.metal
index 7630febf77..c7f97043bf 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PriorBoxKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PriorBoxKernel.metal
@@ -16,20 +16,20 @@
 using namespace metal;
 
 struct PriorBoxMetalParam {
-  float offset;
-  float stepWidth;
-  float stepHeight;
-  float minSize;
-  float maxSize;
-  float imageWidth;
-  float imageHeight;
-  
-  bool clip;
-  
-  uint numPriors;
-  uint aspecRatiosSize;
-  uint minSizeSize;
-  uint maxSizeSize;
+    float offset;
+    float stepWidth;
+    float stepHeight;
+    float minSize;
+    float maxSize;
+    float imageWidth;
+    float imageHeight;
+    
+    bool clip;
+    
+    uint numPriors;
+    uint aspecRatiosSize;
+    uint minSizeSize;
+    uint maxSizeSize;
 };
 
 kernel void prior_box(texture2d_array<float, access::read> inTexture [[texture(0)]],
@@ -39,329 +39,329 @@ kernel void prior_box(texture2d_array<float, access::read> inTexture [[texture(0
                       constant PriorBoxMetalParam &param [[buffer(1)]],
                       const device float4 *variances [[buffer(2)]],
                       uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outBoxTexture.get_width() ||
-      gid.y >= outBoxTexture.get_height() ||
-      gid.z >= outBoxTexture.get_array_size()) return;
-  
-  float center_x = (gid.x + param.offset) * param.stepWidth;
-  float center_y = (gid.y + param.offset) * param.stepHeight;
-  
-  float box_width, box_height;
-  
-  if (gid.z < param.aspecRatiosSize) {
-    float ar = aspect_ratios[gid.z];
-    box_width = param.minSize * sqrt(ar) / 2;
-    box_height = param.minSize / sqrt(ar) / 2;
-    float4 box;
-    box.x = (center_x - box_width) / param.imageWidth;
-    box.y = (center_y - box_height) / param.imageHeight;
-    box.z = (center_x + box_width) / param.imageWidth;
-    box.w = (center_y + box_height) / param.imageHeight;
+    if (gid.x >= outBoxTexture.get_width() ||
+        gid.y >= outBoxTexture.get_height() ||
+        gid.z >= outBoxTexture.get_array_size()) return;
     
-    float4 res;
-    if (param.clip) {
-      res = fmin(fmax(box, 0.0), 1.0);
-    } else {
-      res = box;
+    float center_x = (gid.x + param.offset) * param.stepWidth;
+    float center_y = (gid.y + param.offset) * param.stepHeight;
+    
+    float box_width, box_height;
+    
+    if (gid.z < param.aspecRatiosSize) {
+        float ar = aspect_ratios[gid.z];
+        box_width = param.minSize * sqrt(ar) / 2;
+        box_height = param.minSize / sqrt(ar) / 2;
+        float4 box;
+        box.x = (center_x - box_width) / param.imageWidth;
+        box.y = (center_y - box_height) / param.imageHeight;
+        box.z = (center_x + box_width) / param.imageWidth;
+        box.w = (center_y + box_height) / param.imageHeight;
+        
+        float4 res;
+        if (param.clip) {
+            res = fmin(fmax(box, 0.0), 1.0);
+        } else {
+            res = box;
+        }
+        
+        outBoxTexture.write(res, gid.xy, gid.z);
+    } else if (gid.z >= param.aspecRatiosSize) {
+        if (param.maxSizeSize > 0) {
+            box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
+            float4 max_box;
+            max_box.x = (center_x - box_width) / param.imageWidth;
+            max_box.y = (center_y - box_height) / param.imageHeight;
+            max_box.z = (center_x + box_width) / param.imageWidth;
+            max_box.w = (center_y + box_height) / param.imageHeight;
+            
+            float4 res;
+            if (param.clip) {
+                res = min(max(max_box, 0.0), 1.0);
+            } else {
+                res = max_box;
+            }
+            outBoxTexture.write(res, gid.xy, gid.z); // write the clip-aware box, matching the aspect-ratio branch
+        }
     }
     
-    outBoxTexture.write(res, gid.xy, gid.z);
-  } else if (gid.z >= param.aspecRatiosSize) {
-    if (param.maxSizeSize > 0) {
-      box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
-      float4 max_box;
-      max_box.x = (center_x - box_width) / param.imageWidth;
-      max_box.y = (center_y - box_height) / param.imageHeight;
-      max_box.z = (center_x + box_width) / param.imageWidth;
-      max_box.w = (center_y + box_height) / param.imageHeight;
-
-      float4 res;
-      if (param.clip) {
-        res = min(max(max_box, 0.0), 1.0);
-      } else {
-        res = max_box;
-      }
-      outBoxTexture.write(max_box, gid.xy, gid.z);
+    float4 variance = variances[0];
+    if (gid.z < param.numPriors) {
+        float4 variances_output;
+        variances_output.x = variance.x;
+        variances_output.y = variance.y;
+        variances_output.z = variance.z;
+        variances_output.w = variance.w;
+        varianceTexture.write(variances_output, gid.xy, gid.z);
     }
-  }
-  
-  float4 variance = variances[0];
-  if (gid.z < param.numPriors) {
-    float4 variances_output;
-    variances_output.x = variance.x;
-    variances_output.y = variance.y;
-    variances_output.z = variance.z;
-    variances_output.w = variance.w;
-    varianceTexture.write(variances_output, gid.xy, gid.z);
-  }
 }
 
 
 kernel void prior_box_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
-                      texture2d_array<half, access::write> outBoxTexture [[texture(1)]],
-                      texture2d_array<half, access::write> varianceTexture [[texture(2)]],
-                      const device half *aspect_ratios [[buffer(0)]],
-                      constant PriorBoxMetalParam &param [[buffer(1)]],
-                      const device float4 *variances [[buffer(2)]],
-                      uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outBoxTexture.get_width() ||
-      gid.y >= outBoxTexture.get_height() ||
-      gid.z >= outBoxTexture.get_array_size()) return;
-  
-  float center_x = (gid.x + param.offset) * param.stepWidth;
-  float center_y = (gid.y + param.offset) * param.stepHeight;
-  
-  float box_width, box_height;
-  
-  if (gid.z < param.aspecRatiosSize) {
-    half ar = aspect_ratios[gid.z];
-    box_width = param.minSize * sqrt(ar) / 2;
-    box_height = param.minSize / sqrt(ar) / 2;
-    float4 box;
-    box.x = (center_x - box_width) / param.imageWidth;
-    box.y = (center_y - box_height) / param.imageHeight;
-    box.z = (center_x + box_width) / param.imageWidth;
-    box.w = (center_y + box_height) / param.imageHeight;
+                           texture2d_array<half, access::write> outBoxTexture [[texture(1)]],
+                           texture2d_array<half, access::write> varianceTexture [[texture(2)]],
+                           const device half *aspect_ratios [[buffer(0)]],
+                           constant PriorBoxMetalParam &param [[buffer(1)]],
+                           const device float4 *variances [[buffer(2)]],
+                           uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outBoxTexture.get_width() ||
+        gid.y >= outBoxTexture.get_height() ||
+        gid.z >= outBoxTexture.get_array_size()) return;
     
-    float4 res;
-    if (param.clip) {
-      res = fmin(fmax(box, 0.0), 1.0);
-    } else {
-      res = box;
+    float center_x = (gid.x + param.offset) * param.stepWidth;
+    float center_y = (gid.y + param.offset) * param.stepHeight;
+    
+    float box_width, box_height;
+    
+    if (gid.z < param.aspecRatiosSize) {
+        half ar = aspect_ratios[gid.z];
+        box_width = param.minSize * sqrt(ar) / 2;
+        box_height = param.minSize / sqrt(ar) / 2;
+        float4 box;
+        box.x = (center_x - box_width) / param.imageWidth;
+        box.y = (center_y - box_height) / param.imageHeight;
+        box.z = (center_x + box_width) / param.imageWidth;
+        box.w = (center_y + box_height) / param.imageHeight;
+        
+        float4 res;
+        if (param.clip) {
+            res = fmin(fmax(box, 0.0), 1.0);
+        } else {
+            res = box;
+        }
+        
+        outBoxTexture.write(half4(res), gid.xy, gid.z);
+    } else if (gid.z >= param.aspecRatiosSize) {
+        if (param.maxSizeSize > 0) {
+            box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
+            float4 max_box;
+            max_box.x = (center_x - box_width) / param.imageWidth;
+            max_box.y = (center_y - box_height) / param.imageHeight;
+            max_box.z = (center_x + box_width) / param.imageWidth;
+            max_box.w = (center_y + box_height) / param.imageHeight;
+            
+            float4 res;
+            if (param.clip) {
+                res = min(max(max_box, 0.0), 1.0);
+            } else {
+                res = max_box;
+            }
+            outBoxTexture.write(half4(res), gid.xy, gid.z); // write the clip-aware box, matching the aspect-ratio branch
+        }
     }
     
-    outBoxTexture.write(half4(res), gid.xy, gid.z);
-  } else if (gid.z >= param.aspecRatiosSize) {
-    if (param.maxSizeSize > 0) {
-      box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
-      float4 max_box;
-      max_box.x = (center_x - box_width) / param.imageWidth;
-      max_box.y = (center_y - box_height) / param.imageHeight;
-      max_box.z = (center_x + box_width) / param.imageWidth;
-      max_box.w = (center_y + box_height) / param.imageHeight;
-      
-      float4 res;
-      if (param.clip) {
-        res = min(max(max_box, 0.0), 1.0);
-      } else {
-        res = max_box;
-      }
-      outBoxTexture.write(half4(max_box), gid.xy, gid.z);
+    float4 variance = variances[0];
+    if (gid.z < param.numPriors) {
+        float4 variances_output;
+        variances_output.x = variance.x;
+        variances_output.y = variance.y;
+        variances_output.z = variance.z;
+        variances_output.w = variance.w;
+        varianceTexture.write(half4(variances_output), gid.xy, gid.z);
     }
-  }
-  
-  float4 variance = variances[0];
-  if (gid.z < param.numPriors) {
-    float4 variances_output;
-    variances_output.x = variance.x;
-    variances_output.y = variance.y;
-    variances_output.z = variance.z;
-    variances_output.w = variance.w;
-    varianceTexture.write(half4(variances_output), gid.xy, gid.z);
-  }
 }
 
 
 
 kernel void prior_box_MinMaxAspectRatiosOrder(texture2d_array<float, access::read> inTexture [[texture(0)]],
-                      texture2d_array<float, access::write> outBoxTexture [[texture(1)]],
-                      texture2d_array<float, access::write> varianceTexture [[texture(2)]],
-                      const device float *aspect_ratios [[buffer(0)]],
-                      constant PriorBoxMetalParam &param [[buffer(1)]],
-                      const device float4 *variances [[buffer(2)]],
-                      uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outBoxTexture.get_width() ||
-      gid.y >= outBoxTexture.get_height() ||
-      gid.z >= outBoxTexture.get_array_size()) return;
-  
-  float center_x = (gid.x + param.offset) * param.stepWidth;
-  float center_y = (gid.y + param.offset) * param.stepHeight;
-  
-  float box_width, box_height;
-  
-  
-  
-  if (gid.z == 0) {
-    box_width = box_height = param.minSize / 2;
+                                              texture2d_array<float, access::write> outBoxTexture [[texture(1)]],
+                                              texture2d_array<float, access::write> varianceTexture [[texture(2)]],
+                                              const device float *aspect_ratios [[buffer(0)]],
+                                              constant PriorBoxMetalParam &param [[buffer(1)]],
+                                              const device float4 *variances [[buffer(2)]],
+                                              uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outBoxTexture.get_width() ||
+        gid.y >= outBoxTexture.get_height() ||
+        gid.z >= outBoxTexture.get_array_size()) return;
     
-    float4 box;
-    box.x = (center_x - box_width) / param.imageWidth;
-    box.y = (center_y - box_height) / param.imageHeight;
-    box.z = (center_x + box_width) / param.imageWidth;
-    box.w = (center_y + box_height) / param.imageHeight;
+    float center_x = (gid.x + param.offset) * param.stepWidth;
+    float center_y = (gid.y + param.offset) * param.stepHeight;
     
-    float4 res;
-    if (param.clip) {
-      res = fmin(fmax(box, 0.0), 1.0);
-    } else {
-      res = box;
+    float box_width, box_height;
+    
+    
+    
+    if (gid.z == 0) {
+        box_width = box_height = param.minSize / 2;
+        
+        float4 box;
+        box.x = (center_x - box_width) / param.imageWidth;
+        box.y = (center_y - box_height) / param.imageHeight;
+        box.z = (center_x + box_width) / param.imageWidth;
+        box.w = (center_y + box_height) / param.imageHeight;
+        
+        float4 res;
+        if (param.clip) {
+            res = fmin(fmax(box, 0.0), 1.0);
+        } else {
+            res = box;
+        }
+        
+        outBoxTexture.write(res, gid.xy, gid.z);
     }
-
-    outBoxTexture.write(res, gid.xy, gid.z);
-  }
-  
-  if (gid.z == 1 && param.maxSizeSize > 0) {
     
-    box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
-    float4 max_box;
-    max_box.x = (center_x - box_width) / param.imageWidth;
-    max_box.y = (center_y - box_height) / param.imageHeight;
-    max_box.z = (center_x + box_width) / param.imageWidth;
-    max_box.w = (center_y + box_height) / param.imageHeight;
+    if (gid.z == 1 && param.maxSizeSize > 0) {
+        
+        box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
+        float4 max_box;
+        max_box.x = (center_x - box_width) / param.imageWidth;
+        max_box.y = (center_y - box_height) / param.imageHeight;
+        max_box.z = (center_x + box_width) / param.imageWidth;
+        max_box.w = (center_y + box_height) / param.imageHeight;
+        
+        float4 res;
+        if (param.clip) {
+            res = min(max(max_box, 0.0), 1.0);
+        } else {
+            res = max_box;
+        }
+        outBoxTexture.write(res, gid.xy, gid.z);
+    }
     
-    float4 res;
-    if (param.clip) {
-      res = min(max(max_box, 0.0), 1.0);
+    int aspect_to = 0;
+    if (param.maxSizeSize > 0) {
+        aspect_to = gid.z - 2;
     } else {
-      res = max_box;
+        aspect_to = gid.z - 1;
     }
-    outBoxTexture.write(res, gid.xy, gid.z);
-  }
-  
-  int aspect_to = 0;
-  if (param.maxSizeSize > 0) {
-    aspect_to = gid.z - 2;
-  } else {
-    aspect_to = gid.z - 1;
-  }
-  
-
-  
-  
-  if (aspect_to >= 0 && aspect_to < int(param.aspecRatiosSize)) {
     
-    int skip = 0;
-    for (int i = 0; i < aspect_to + 1; ++i) {
-      if (fabs(aspect_ratios[i] - 1.) < 1e-6) {
-        skip += 1;
-      }
-    }
-    aspect_to += skip;
     
-    float ar = aspect_ratios[aspect_to];
     
-    box_width = param.minSize * sqrt(ar) / 2;
-    box_height = param.minSize / sqrt(ar) / 2;
-    float4 box;
-    box.x = (center_x - box_width) / param.imageWidth;
-    box.y = (center_y - box_height) / param.imageHeight;
-    box.z = (center_x + box_width) / param.imageWidth;
-    box.w = (center_y + box_height) / param.imageHeight;
     
-    float4 res;
-    if (param.clip) {
-      res = fmin(fmax(box, 0.0), 1.0);
-    } else {
-      res = box;
+    if (aspect_to >= 0 && aspect_to < int(param.aspecRatiosSize)) {
+        
+        int skip = 0;
+        for (int i = 0; i < aspect_to + 1; ++i) {
+            if (fabs(aspect_ratios[i] - 1.) < 1e-6) {
+                skip += 1;
+            }
+        }
+        aspect_to += skip;
+        
+        float ar = aspect_ratios[aspect_to];
+        
+        box_width = param.minSize * sqrt(ar) / 2;
+        box_height = param.minSize / sqrt(ar) / 2;
+        float4 box;
+        box.x = (center_x - box_width) / param.imageWidth;
+        box.y = (center_y - box_height) / param.imageHeight;
+        box.z = (center_x + box_width) / param.imageWidth;
+        box.w = (center_y + box_height) / param.imageHeight;
+        
+        float4 res;
+        if (param.clip) {
+            res = fmin(fmax(box, 0.0), 1.0);
+        } else {
+            res = box;
+        }
+        
+        outBoxTexture.write(res, gid.xy, gid.z);
     }
     
-    outBoxTexture.write(res, gid.xy, gid.z);
-  }
-  
-  float4 variance = variances[0];
-  if (gid.z < param.numPriors) {
-    float4 variances_output;
-    variances_output.x = variance.x;
-    variances_output.y = variance.y;
-    variances_output.z = variance.z;
-    variances_output.w = variance.w;
-    varianceTexture.write(variances_output, gid.xy, gid.z);
-  }
+    float4 variance = variances[0];
+    if (gid.z < param.numPriors) {
+        float4 variances_output;
+        variances_output.x = variance.x;
+        variances_output.y = variance.y;
+        variances_output.z = variance.z;
+        variances_output.w = variance.w;
+        varianceTexture.write(variances_output, gid.xy, gid.z);
+    }
 }
 
 
 kernel void prior_box_MinMaxAspectRatiosOrder_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
-                           texture2d_array<half, access::write> outBoxTexture [[texture(1)]],
-                           texture2d_array<half, access::write> varianceTexture [[texture(2)]],
-                           const device half *aspect_ratios [[buffer(0)]],
-                           constant PriorBoxMetalParam &param [[buffer(1)]],
-                           const device float4 *variances [[buffer(2)]],
-                           uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outBoxTexture.get_width() ||
-      gid.y >= outBoxTexture.get_height() ||
-      gid.z >= outBoxTexture.get_array_size()) return;
-  
-  float center_x = (gid.x + param.offset) * param.stepWidth;
-  float center_y = (gid.y + param.offset) * param.stepHeight;
-  
-  float box_width, box_height;
-  
-  
-  
-  if (gid.z == 0) {
-    box_width = box_height = param.minSize / 2;
+                                                   texture2d_array<half, access::write> outBoxTexture [[texture(1)]],  // receives one box (x0, y0, x1, y1) per (x, y, slice)
+                                                   texture2d_array<half, access::write> varianceTexture [[texture(2)]],  // receives the variance vector per (x, y, slice)
+                                                   const device half *aspect_ratios [[buffer(0)]],
+                                                   constant PriorBoxMetalParam &param [[buffer(1)]],
+                                                   const device float4 *variances [[buffer(2)]],  // only element 0 is read
+                                                   uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outBoxTexture.get_width() ||  // drop threads outside the box texture
+        gid.y >= outBoxTexture.get_height() ||
+        gid.z >= outBoxTexture.get_array_size()) return;
     
-    float4 box;
-    box.x = (center_x - box_width) / param.imageWidth;
-    box.y = (center_y - box_height) / param.imageHeight;
-    box.z = (center_x + box_width) / param.imageWidth;
-    box.w = (center_y + box_height) / param.imageHeight;
+    float center_x = (gid.x + param.offset) * param.stepWidth;  // box centre for this grid cell; normalised by imageWidth below
+    float center_y = (gid.y + param.offset) * param.stepHeight;  // normalised by imageHeight below
     
-    float4 res;
-    if (param.clip) {
-      res = fmin(fmax(box, 0.0), 1.0);
-    } else {
-      res = box;
-    }
+    float box_width, box_height;  // half-extents of the box around the centre
     
-    outBoxTexture.write(half4(res), gid.xy, gid.z);
-  }
-  
-  if (gid.z == 1 && param.maxSizeSize > 0) {
     
-    box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
-    float4 max_box;
-    max_box.x = (center_x - box_width) / param.imageWidth;
-    max_box.y = (center_y - box_height) / param.imageHeight;
-    max_box.z = (center_x + box_width) / param.imageWidth;
-    max_box.w = (center_y + box_height) / param.imageHeight;
     
-    float4 res;
-    if (param.clip) {
-      res = min(max(max_box, 0.0), 1.0);
-    } else {
-      res = max_box;
+    if (gid.z == 0) {  // slice 0: square box with half-side minSize / 2
+        box_width = box_height = param.minSize / 2;
+        
+        float4 box;
+        box.x = (center_x - box_width) / param.imageWidth;
+        box.y = (center_y - box_height) / param.imageHeight;
+        box.z = (center_x + box_width) / param.imageWidth;
+        box.w = (center_y + box_height) / param.imageHeight;
+        
+        float4 res;
+        if (param.clip) {
+            res = fmin(fmax(box, 0.0), 1.0);  // clamp every coordinate to [0, 1]
+        } else {
+            res = box;
+        }
+        
+        outBoxTexture.write(half4(res), gid.xy, gid.z);  // computed in float, stored as half
     }
-    outBoxTexture.write(half4(res), gid.xy, gid.z);
-  }
-  
-  int aspect_to = 0;
-  if (param.maxSizeSize > 0) {
-    aspect_to = gid.z - 2;
-  } else {
-    aspect_to = gid.z - 1;
-  }
-  
-  if (aspect_to > 0 && aspect_to < int(param.aspecRatiosSize) && fabs(aspect_ratios[aspect_to] - 1.) > 1e-6) {
-    float ar = aspect_ratios[aspect_to];
     
-    box_width = param.minSize * sqrt(ar) / 2;
-    box_height = param.minSize / sqrt(ar) / 2;
-    float4 box;
-    box.x = (center_x - box_width) / param.imageWidth;
-    box.y = (center_y - box_height) / param.imageHeight;
-    box.z = (center_x + box_width) / param.imageWidth;
-    box.w = (center_y + box_height) / param.imageHeight;
+    if (gid.z == 1 && param.maxSizeSize > 0) {  // slice 1, only when a max size is configured
+        
+        box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;  // square box with half-side sqrt(minSize * maxSize) / 2
+        float4 max_box;
+        max_box.x = (center_x - box_width) / param.imageWidth;
+        max_box.y = (center_y - box_height) / param.imageHeight;
+        max_box.z = (center_x + box_width) / param.imageWidth;
+        max_box.w = (center_y + box_height) / param.imageHeight;
+        
+        float4 res;
+        if (param.clip) {
+            res = min(max(max_box, 0.0), 1.0);  // same [0, 1] clamp as the other branches, spelled with min/max
+        } else {
+            res = max_box;
+        }
+        outBoxTexture.write(half4(res), gid.xy, gid.z);
+    }
     
-    float4 res;
-    if (param.clip) {
-      res = fmin(fmax(box, 0.0), 1.0);
+    int aspect_to = 0;  // index into aspect_ratios for the remaining slices
+    if (param.maxSizeSize > 0) {
+        aspect_to = gid.z - 2;  // slices 0 and 1 are used by the min and max boxes
     } else {
-      res = box;
+        aspect_to = gid.z - 1;  // only slice 0 is used by the min box
     }
     
-    outBoxTexture.write(half4(res), gid.xy, gid.z);
-  }
-  
-  float4 variance = variances[0];
-  if (gid.z < param.numPriors) {
-    float4 variances_output;
-    variances_output.x = variance.x;
-    variances_output.y = variance.y;
-    variances_output.z = variance.z;
-    variances_output.w = variance.w;
-    varianceTexture.write(half4(variances_output), gid.xy, gid.z);
-  }
+    if (aspect_to > 0 && aspect_to < int(param.aspecRatiosSize) && fabs(aspect_ratios[aspect_to] - 1.) > 1e-6) {
+        float ar = aspect_ratios[aspect_to];  // NOTE(review): index 0 and ratios within 1e-6 of 1 are excluded above, so those slices get no box from this kernel
+        
+        box_width = param.minSize * sqrt(ar) / 2;  // width scaled by sqrt(ar), height by 1 / sqrt(ar)
+        box_height = param.minSize / sqrt(ar) / 2;
+        float4 box;
+        box.x = (center_x - box_width) / param.imageWidth;
+        box.y = (center_y - box_height) / param.imageHeight;
+        box.z = (center_x + box_width) / param.imageWidth;
+        box.w = (center_y + box_height) / param.imageHeight;
+        
+        float4 res;
+        if (param.clip) {
+            res = fmin(fmax(box, 0.0), 1.0);
+        } else {
+            res = box;
+        }
+        
+        outBoxTexture.write(half4(res), gid.xy, gid.z);  // NOTE(review): the preceding kernel in this file tests "aspect_to >= 0" and remaps the index instead of skipping; confirm the difference is intended
+    }
+    
+    float4 variance = variances[0];  // a single variance vector is shared by every box
+    if (gid.z < param.numPriors) {  // slices at or beyond numPriors get no variance
+        float4 variances_output;
+        variances_output.x = variance.x;
+        variances_output.y = variance.y;
+        variances_output.z = variance.z;
+        variances_output.w = variance.w;
+        varianceTexture.write(half4(variances_output), gid.xy, gid.z);
+    }
 }
 
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReluKernel.metal
index e725440bbe..725222d75e 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReluKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReluKernel.metal
@@ -17,25 +17,25 @@ using namespace metal;
 
 
 kernel void relu_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                 texture2d_array<half, access::write> outTexture [[texture(1)]],
-                 uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const half4 input = inTexture.read(gid.xy, gid.z);
-  const float4 relu = fmax((float4)input, 0.0);
-  outTexture.write(half4(relu), gid.xy, gid.z);
+                      texture2d_array<half, access::write> outTexture [[texture(1)]],  // its extent bounds the threads that do work
+                      uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||  // element-wise ReLU: skip threads outside the output
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero);  // NOTE(review): declared but never used; the texel is fetched with read()
+    const half4 input = inTexture.read(gid.xy, gid.z);
+    const float4 relu = fmax((float4)input, 0.0);  // negative lanes become 0; comparison done in float
+    outTexture.write(half4(relu), gid.xy, gid.z);  // converted back to half for storage
 }
 
 kernel void relu(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                      texture2d_array<float, access::write> outTexture [[texture(1)]],
-                      uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const float4 input = inTexture.read(gid.xy, gid.z);
-  const float4 relu = fmax((float4)input, 0.0);
-  outTexture.write(float4(relu), gid.xy, gid.z);
+                 texture2d_array<float, access::write> outTexture [[texture(1)]],  // its extent bounds the threads that do work
+                 uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||  // element-wise ReLU: skip threads outside the output
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero);  // NOTE(review): declared but never used; the texel is fetched with read()
+    const float4 input = inTexture.read(gid.xy, gid.z);
+    const float4 relu = fmax((float4)input, 0.0);  // negative lanes become 0
+    outTexture.write(float4(relu), gid.xy, gid.z);
 }
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.inc.metal
index 7583537c2b..3037e404a3 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.inc.metal
@@ -24,43 +24,43 @@
 #define FUNC_R(f, r) CONCAT2_(f, r)
 
 kernel void FUNC(reshape, RIN, ROUT, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
-                    texture2d_array<P, access::write> outTexture [[texture(1)]],
-                    constant ReshapeParam &rp [[buffer(0)]],
-                    uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-
-  int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, oabcd[4], ixyzn[4], iabcd[4];
-  ReshapeParam lrp = rp;
-  int oC = lrp.odim[lrp.otrans[3]];
-  int iC = lrp.idim[lrp.itrans[3]];
-  int count = lrp.odim[0] * lrp.odim[1] * lrp.odim[2] * lrp.odim[3];
-  VECTOR(P, 4) r;
-  for (int n = 0; n < 4; n++) {
-    oxyzn[3] = n;
+                                        texture2d_array<P, access::write> outTexture [[texture(1)]],
+                                        constant ReshapeParam &rp [[buffer(0)]],
+                                        uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||  // one thread per output texel; skip threads outside the output
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    
+    int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, oabcd[4], ixyzn[4], iabcd[4];  // oxyzn = output (x, y, slice, lane); the lane is set per loop iteration
+    ReshapeParam lrp = rp;  // thread-local copy of the parameters
+    int oC = lrp.odim[lrp.otrans[3]];  // only consumed by the rank-4 output conversion
+    int iC = lrp.idim[lrp.itrans[3]];  // only consumed by the rank-4 input conversion
+    int count = lrp.odim[0] * lrp.odim[1] * lrp.odim[2] * lrp.odim[3];  // product of the four output dims; linear indices >= count are zero-filled
+    VECTOR(P, 4) r;  // the four lanes of this output texel
+    for (int n = 0; n < 4; n++) {
+        oxyzn[3] = n;
 #if ROUT == 4
-    xyzn2abcd_4(oC, oxyzn, oabcd);
+        xyzn2abcd_4(oC, oxyzn, oabcd);  // presumably texture coords -> logical 4-d coords; helper is defined outside this hunk
 #else
-    FUNC_R(xyzn2abcd, ROUT)(oxyzn, oabcd);
+        FUNC_R(xyzn2abcd, ROUT)(oxyzn, oabcd);  // expands to xyzn2abcd_<ROUT>
 #endif
-    int tabcd[4];
-    invtrans(lrp.otrans, oabcd, tabcd);
-    int index = abcd2index(lrp.odim, tabcd);
-    if (index < count) {
-      index2abcd(lrp.idim, index, tabcd);
-      trans(lrp.itrans, tabcd, iabcd);
+        int tabcd[4];
+        invtrans(lrp.otrans, oabcd, tabcd);
+        int index = abcd2index(lrp.odim, tabcd);  // linear index of this output element
+        if (index < count) {
+            index2abcd(lrp.idim, index, tabcd);  // same linear index re-expressed against the input dims
+            trans(lrp.itrans, tabcd, iabcd);
 #if RIN == 4
-      abcd2xyzn_4(iC, iabcd, ixyzn);
+            abcd2xyzn_4(iC, iabcd, ixyzn);
 #else
-      FUNC_R(abcd2xyzn, RIN)(iabcd, ixyzn);
+            FUNC_R(abcd2xyzn, RIN)(iabcd, ixyzn);  // expands to abcd2xyzn_<RIN>
 #endif
-      r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]];
-    } else {
-      r[n] = 0;
+            r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]];  // gather one component from the input texel
+        } else {
+            r[n] = 0;  // lane lies past the end of the logical tensor
+        }
     }
-  }
-  outTexture.write(r, gid.xy, gid.z);
+    outTexture.write(r, gid.xy, gid.z);
 }
 
 #endif
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.metal
index d2f5815d42..bb155a87a3 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.metal
@@ -18,10 +18,10 @@
 using namespace metal;
 
 struct ReshapeParam {
-  int32_t idim[4];
-  int32_t itrans[4];
-  int32_t odim[4];
-  int32_t otrans[4];
+    int32_t idim[4];    // input dims; the reshape kernel passes them to index2abcd()
+    int32_t itrans[4];  // passed to trans() on the input side; itrans[3] selects the idim entry used as iC
+    int32_t odim[4];    // output dims; their product bounds the valid linear indices
+    int32_t otrans[4];  // passed to invtrans() on the output side; otrans[3] selects the odim entry used as oC
 };
 
 #define P float
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ResizeBilinear.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ResizeBilinear.metal
index fbb4e12cb8..3cca15d551 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ResizeBilinear.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ResizeBilinear.metal
@@ -16,60 +16,60 @@
 using namespace metal;
 
 struct resize_bilinear_param {
-//  int32_t out_h;
-//  int32_t out_w;
-  float ratio_h;
-  float ratio_w;
+    //  int32_t out_h;
+    //  int32_t out_w;
+    float ratio_h;  // multiplied by the output row (gid.y) to get the source row
+    float ratio_w;  // multiplied by the output column (gid.x) to get the source column
 };
 
 kernel void resize_bilinear(texture2d_array<float, access::read> input [[texture(0)]],
-                     texture2d_array<float, access::write> output [[texture(2)]],
-                     constant resize_bilinear_param & pm [[buffer(0)]],
-                     uint3 gid [[thread_position_in_grid]]) {
-  float4 r;
-  if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
-    r = input.read(gid.xy, gid.z);
-  } else {
-    float w = gid.x * pm.ratio_w;
-    float h = gid.y * pm.ratio_h;
-    uint w0 = w, h0 = h;
-    uint w1 = w0 + 1, h1 = h0 + 1;
-    float w1lambda = w - w0, h1lambda = h - h0;
-    float w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
-    if (w1 >= input.get_width()) w1 = w0;
-    if (h1 >= input.get_height()) h1 = h0;
-    float4 r0 = input.read(uint2(w0, h0), gid.z);
-    float4 r1 = input.read(uint2(w1, h0), gid.z);
-    float4 r2 = input.read(uint2(w0, h1), gid.z);
-    float4 r3 = input.read(uint2(w1, h1), gid.z);
-    r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);
-  }
-  output.write(r, gid.xy, gid.z);
+                            texture2d_array<float, access::write> output [[texture(2)]],  // bound at texture index 2; index 1 is not used by this kernel
+                            constant resize_bilinear_param & pm [[buffer(0)]],
+                            uint3 gid [[thread_position_in_grid]]) {
+    float4 r;
+    if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
+        r = input.read(gid.xy, gid.z);  // same size: plain copy
+    } else {
+        float w = gid.x * pm.ratio_w;  // source column for this output column
+        float h = gid.y * pm.ratio_h;  // source row for this output row
+        uint w0 = w, h0 = h;  // truncation gives the top-left neighbour
+        uint w1 = w0 + 1, h1 = h0 + 1;  // right / bottom neighbours
+        float w1lambda = w - w0, h1lambda = h - h0;  // fractional parts: weights of the right / bottom neighbours
+        float w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;  // complementary weights of the left / top neighbours
+        if (w1 >= input.get_width()) w1 = w0;  // reuse the last column at the right edge
+        if (h1 >= input.get_height()) h1 = h0;  // reuse the last row at the bottom edge
+        float4 r0 = input.read(uint2(w0, h0), gid.z);
+        float4 r1 = input.read(uint2(w1, h0), gid.z);
+        float4 r2 = input.read(uint2(w0, h1), gid.z);
+        float4 r3 = input.read(uint2(w1, h1), gid.z);
+        r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);  // blend along x within each row, then along y
+    }
+    output.write(r, gid.xy, gid.z);  // NOTE(review): gid is never checked against the output extent here; confirm the dispatch cannot over-run
 }
 
 kernel void resize_bilinear_half(texture2d_array<half, access::read> input [[texture(0)]],
-                            texture2d_array<half, access::write> output [[texture(2)]],
-                            constant resize_bilinear_param & pm [[buffer(0)]],
-                            uint3 gid [[thread_position_in_grid]]) {
-  
-  half4 r;
-  if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
-    r = input.read(gid.xy, gid.z);
-  } else {
-    half w = gid.x * pm.ratio_w;
-    half h = gid.y * pm.ratio_h;
-    uint w0 = w, h0 = h;
-    uint w1 = w0 + 1, h1 = h0 + 1;
-    half w1lambda = w - w0, h1lambda = h - h0;
-    half w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
-    if (w1 >= input.get_width()) w1 = w0;
-    if (h1 >= input.get_height()) h1 = h0;
-    half4 r0 = input.read(uint2(w0, h0), gid.z);
-    half4 r1 = input.read(uint2(w1, h0), gid.z);
-    half4 r2 = input.read(uint2(w0, h1), gid.z);
-    half4 r3 = input.read(uint2(w1, h1), gid.z);
-    r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);
-  }
-  output.write(r, gid.xy, gid.z);
-  output.write(r, gid.xy, gid.z);
+                                 texture2d_array<half, access::write> output [[texture(2)]],  // bound at texture index 2; index 1 is not used by this kernel
+                                 constant resize_bilinear_param & pm [[buffer(0)]],
+                                 uint3 gid [[thread_position_in_grid]]) {
+    
+    half4 r;
+    if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
+        r = input.read(gid.xy, gid.z);  // same size: plain copy
+    } else {
+        half w = gid.x * pm.ratio_w;  // NOTE(review): source coordinates are kept in half precision; confirm this is accurate enough for the texture sizes used
+        half h = gid.y * pm.ratio_h;
+        uint w0 = w, h0 = h;  // truncation gives the top-left neighbour
+        uint w1 = w0 + 1, h1 = h0 + 1;  // right / bottom neighbours
+        half w1lambda = w - w0, h1lambda = h - h0;  // fractional parts: weights of the right / bottom neighbours
+        half w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;  // complementary weights of the left / top neighbours
+        if (w1 >= input.get_width()) w1 = w0;  // reuse the last column at the right edge
+        if (h1 >= input.get_height()) h1 = h0;  // reuse the last row at the bottom edge
+        half4 r0 = input.read(uint2(w0, h0), gid.z);
+        half4 r1 = input.read(uint2(w1, h0), gid.z);
+        half4 r2 = input.read(uint2(w0, h1), gid.z);
+        half4 r3 = input.read(uint2(w1, h1), gid.z);
+        r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);  // blend along x within each row, then along y
+    }
+    output.write(r, gid.xy, gid.z);  // NOTE(review): gid is never checked against the output extent here; confirm the dispatch cannot over-run
+    output.write(r, gid.xy, gid.z);  // NOTE(review): repeats the write on the previous line with identical arguments; looks redundant
 }
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Scale.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Scale.metal
index ae4ccdef75..62b5fd0c92 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Scale.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Scale.metal
@@ -10,21 +10,21 @@
 using namespace metal;
 
 kernel void scale(texture2d<float, access::sample> inTexture [[texture(0)]], texture2d<float, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) return;
-  float w_stride = inTexture.get_width() / outTexture.get_width();
-  float h_stride = inTexture.get_height() / outTexture.get_height();
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  float4 input = inTexture.sample(sample, float2(gid.x * w_stride,    gid.y * h_stride), 0);
-  outTexture.write(input, gid);
+    if (gid.x >= outTexture.get_width() ||  // skip threads outside the output texture
+        gid.y >= outTexture.get_height()) return;
+    float w_stride = inTexture.get_width() / outTexture.get_width();  // NOTE(review): both operands are unsigned integers, so the quotient is truncated before the float conversion; confirm non-integer ratios are not expected
+    float h_stride = inTexture.get_height() / outTexture.get_height();  // NOTE(review): same integer division as w_stride
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);  // pixel coordinates, nearest texel, zero outside the texture
+    float4 input = inTexture.sample(sample, float2(gid.x * w_stride,    gid.y * h_stride), 0);  // pick the source texel at (output position * stride)
+    outTexture.write(input, gid);
 }
 
 kernel void scale_half(texture2d<float, access::sample> inTexture [[texture(0)]], texture2d<half, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) return;
-  float w_stride = inTexture.get_width() / outTexture.get_width();
-  float h_stride = inTexture.get_height() / outTexture.get_height();
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  float4 input = inTexture.sample(sample, float2(gid.x * w_stride,    gid.y * h_stride), 0);
-  outTexture.write(half4(input), gid);
+    if (gid.x >= outTexture.get_width() ||  // skip threads outside the output texture
+        gid.y >= outTexture.get_height()) return;
+    float w_stride = inTexture.get_width() / outTexture.get_width();  // NOTE(review): both operands are unsigned integers, so the quotient is truncated before the float conversion; confirm non-integer ratios are not expected
+    float h_stride = inTexture.get_height() / outTexture.get_height();  // NOTE(review): same integer division as w_stride
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);  // pixel coordinates, nearest texel, zero outside the texture
+    float4 input = inTexture.sample(sample, float2(gid.x * w_stride,    gid.y * h_stride), 0);  // the input stays float; only the output is half
+    outTexture.write(half4(input), gid);  // narrowed to half on store
 }
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.inc.metal
index 455cf1471b..3affcadd79 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.inc.metal
@@ -21,41 +21,41 @@
 #define VECTOR(p, n) CONCAT2(p, n)
 
 kernel void FUNC(softmax, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
-                    texture2d_array<P, access::write> outTexture [[texture(1)]],
-                    constant SoftmaxParam &sp [[buffer(0)]],
-                    uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-//  int zsize = inTexture.get_array_size();
-  P maxv = inTexture.read(uint2(0, gid.y), 0)[0];
-  int group = sp.K / 4;
-  int remain = sp.K % 4;
-  for (int x = 0; x < group; x++) {
-    VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0);
-    maxv = max(maxv, max(r[0], max(r[1], max(r[2], r[3]))));
-  }
-  if (remain > 0) {
-    VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0);
-    for (int i = 0; i < remain; i++) {
-      maxv = max(maxv, r[i]);
+                             texture2d_array<P, access::write> outTexture [[texture(1)]],
+                             constant SoftmaxParam &sp [[buffer(0)]],  // only sp.K is read by this kernel
+                             uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||  // skip threads outside the output texture
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    //  int zsize = inTexture.get_array_size();
+    P maxv = inTexture.read(uint2(0, gid.y), 0)[0];  // seed the running max with the first element of row gid.y, slice 0
+    int group = sp.K / 4;  // number of texels whose four lanes all hold valid elements
+    int remain = sp.K % 4;  // valid lanes in the trailing, partially filled texel
+    for (int x = 0; x < group; x++) {  // pass 1: maximum over the K elements of the row
+        VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0);
+        maxv = max(maxv, max(r[0], max(r[1], max(r[2], r[3]))));
     }
-  }
-  VECTOR(P, 4) rsum = {0, 0, 0, 0};
-  for (int x = 0; x < group; x++) {
-    VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0);
-    rsum += exp(r - maxv);
-  }
-  P sum = rsum[0] + rsum[1] + rsum[2] + rsum[3];
-  if (remain > 0) {
-    VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0);
-    for (int i = 0; i < remain; i++) {
-      sum += exp(r[i] - maxv);
+    if (remain > 0) {
+        VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0);
+        for (int i = 0; i < remain; i++) {  // only the valid lanes of the trailing texel
+            maxv = max(maxv, r[i]);
+        }
     }
-  }
-  VECTOR(P, 4) rr = inTexture.read(gid.xy, gid.z);
-  rr = exp(rr - maxv) / sum;
-  outTexture.write(rr, gid.xy, gid.z);
+    VECTOR(P, 4) rsum = {0, 0, 0, 0};  // per-lane partial sums of exp(x - max)
+    for (int x = 0; x < group; x++) {  // pass 2: sum of exponentials, shifted by the row maximum
+        VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0);
+        rsum += exp(r - maxv);
+    }
+    P sum = rsum[0] + rsum[1] + rsum[2] + rsum[3];
+    if (remain > 0) {
+        VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0);
+        for (int i = 0; i < remain; i++) {
+            sum += exp(r[i] - maxv);
+        }
+    }
+    VECTOR(P, 4) rr = inTexture.read(gid.xy, gid.z);  // this thread's own texel; note the reductions above always read slice 0
+    rr = exp(rr - maxv) / sum;  // normalise all four lanes, including any lanes beyond K
+    outTexture.write(rr, gid.xy, gid.z);
 }
 
 #endif
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.metal
index 67c279a444..f4bc8de4bc 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.metal
@@ -16,8 +16,8 @@
 using namespace metal;
 
 struct SoftmaxParam {
-  int N;
-  int K;
+    int N;  // not read by the softmax kernel shown in Softmax.inc.metal
+    int K;  // number of elements reduced per row by the softmax kernel
 };
 
 #define P float
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.inc.metal
index 54e3f21e79..1c9bcc7e18 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.inc.metal
@@ -36,41 +36,41 @@
 
 #if V == VY
 kernel void FUNC(split, R, N, VV, P)(texture2d_array<P, access::read> input [[texture(0)]],
-                                 texture2d_array<P, access::write> out1 [[texture(1)]],
-                                 texture2d_array<P, access::write> out2 [[texture(2)]],
+                                     texture2d_array<P, access::write> out1 [[texture(1)]],
+                                     texture2d_array<P, access::write> out2 [[texture(2)]],
 #if N >= 3
-                                 texture2d_array<P, access::write> out3 [[texture(3)]],
+                                     texture2d_array<P, access::write> out3 [[texture(3)]],  // only present when N >= 3
 #endif // N >= 3
 #if N >= 4
-                                 texture2d_array<P, access::write> out4 [[texture(4)]],
+                                     texture2d_array<P, access::write> out4 [[texture(4)]],  // only present when N >= 4
 #endif // N >= 4
-                                 constant SplitParam &sp [[buffer(0)]],
-                                 uint3 gid [[thread_position_in_grid]]) {
-
-  VECTOR(P, 4) r = input.read(gid.xy, gid.z);
-  int y = gid.y - sp.offset;
-  if (y < sp.vdim[0]) {
-    out1.write(r, gid.xy, gid.z);
-    return;
-  }
-  y -= sp.vdim[0];
-  if (y < sp.vdim[1]) {
-    out2.write(r, uint2(gid.x, y), gid.z);
-    return;
-  }
+                                     constant SplitParam &sp [[buffer(0)]],  // only sp.offset and sp.vdim are read here
+                                     uint3 gid [[thread_position_in_grid]]) {
+    
+    VECTOR(P, 4) r = input.read(gid.xy, gid.z);  // one thread per input texel; it is routed to exactly one output
+    int y = gid.y - sp.offset;  // row position relative to sp.offset
+    if (y < sp.vdim[0]) {  // NOTE(review): out1 is written at gid.xy, not (gid.x, y) like the other outputs; confirm this is intended when sp.offset != 0
+        out1.write(r, gid.xy, gid.z);
+        return;
+    }
+    y -= sp.vdim[0];  // rebase the row onto the second output
+    if (y < sp.vdim[1]) {
+        out2.write(r, uint2(gid.x, y), gid.z);
+        return;
+    }
 #if N >= 3
-  y -= sp.vdim[1];
-  if (y < sp.vdim[2]) {
-    out3.write(r, uint2(gid.x, y), gid.z);
-    return;
-  }
+    y -= sp.vdim[1];  // rebase the row onto the third output
+    if (y < sp.vdim[2]) {
+        out3.write(r, uint2(gid.x, y), gid.z);
+        return;
+    }
 #endif // N >= 3
 #if N >= 4
-  y -= sp.vdim[2];
-  if (y < sp.vdim[3]) {
-    out4.write(r, uint2(gid.x, y), gid.z);
-    return;
-  }
+    y -= sp.vdim[2];  // rebase the row onto the fourth output
+    if (y < sp.vdim[3]) {
+        out4.write(r, uint2(gid.x, y), gid.z);
+        return;
+    }
 #endif // N >= 4
 }
 #endif // V == VY
@@ -88,30 +88,30 @@ kernel void FUNC(split, R, N, VV, P)(texture2d_array<P, access::read> input [[te
 #endif // N >= 4
                                      constant SplitParam &sp [[buffer(0)]],
                                      uint3 gid [[thread_position_in_grid]]) {
-  VECTOR(P, 4) r = input.read(gid.xy, gid.z);
-  int x = gid.x;
-  if (x < sp.vdim[0]) {
-    out1.write(r, gid.xy, gid.z);
-    return;
-  }
-  x -= sp.vdim[0];
-  if (x < sp.vdim[1]) {
-    out2.write(r, uint2(x, gid.y), gid.z);
-    return;
-  }
+    VECTOR(P, 4) r = input.read(gid.xy, gid.z);
+    int x = gid.x;
+    if (x < sp.vdim[0]) {
+        out1.write(r, gid.xy, gid.z);
+        return;
+    }
+    x -= sp.vdim[0];
+    if (x < sp.vdim[1]) {
+        out2.write(r, uint2(x, gid.y), gid.z);
+        return;
+    }
 #if N >= 3
-  x -= sp.vdim[1];
-  if (x < sp.vdim[2]) {
-    out3.write(r, uint2(x, gid.y), gid.z);
-    return;
-  }
+    x -= sp.vdim[1];
+    if (x < sp.vdim[2]) {
+        out3.write(r, uint2(x, gid.y), gid.z);
+        return;
+    }
 #endif // N >= 3
 #if N >= 4
-  x -= sp.vdim[2];
-  if (x < sp.vdim[3]) {
-    out4.write(r, uint2(x, gid.y), gid.z);
-    return;
-  }
+    x -= sp.vdim[2];
+    if (x < sp.vdim[3]) {
+        out4.write(r, uint2(x, gid.y), gid.z);
+        return;
+    }
 #endif // N >= 4
 }
 #endif // V == VX
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.metal
index 4c1e818d2b..d167608fbb 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.metal
@@ -18,11 +18,11 @@
 using namespace metal;
 
 struct SplitParam {
-  int32_t idim[4];
-  int32_t axis;
-  int32_t offset;
-  int32_t trans[4];
-  int32_t vdim[4];
+    int32_t idim[4];   // not read by the split kernel bodies shown in Split.inc.metal
+    int32_t axis;      // not read by the split kernel bodies shown in Split.inc.metal
+    int32_t offset;    // subtracted from gid.y in the VY variant of the split kernel
+    int32_t trans[4];  // not read by the split kernel bodies shown in Split.inc.metal
+    int32_t vdim[4];   // extent of each output along the split direction; entry i bounds output i + 1
 };
 
 #define VNORMAL 1
@@ -36,29 +36,29 @@ struct SplitParam {
 
 //// ssd-ar: (R=3, N=2, V=y)
 #define V VY
-  #define R 3
-    #define N 2
-      #define P float
-        #include "Split.inc.metal"
-      #undef P
-      #define P half
-        #include "Split.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 3  // forwarded to FUNC(split, R, N, VV, P) in Split.inc.metal
+#define N 2  // number of output textures; out3 / out4 exist only for N >= 3 / N >= 4
+#define P float  // texel element type of the generated kernel
+#include "Split.inc.metal"  // instantiate the float variant
+#undef P
+#define P half
+#include "Split.inc.metal"  // instantiate the half variant
+#undef P
+#undef N
+#undef R
 #undef V
 
 
 //// ssd-ar: (R=2, N=2, V=y)
 #define V VY
-  #define R 2
-    #define N 2
-      #define P float
-        #include "Split.inc.metal"
-      #undef P
-      #define P half
-        #include "Split.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 2  // forwarded to FUNC(split, R, N, VV, P) in Split.inc.metal
+#define N 2  // number of output textures; out3 / out4 exist only for N >= 3 / N >= 4
+#define P float  // texel element type of the generated kernel
+#include "Split.inc.metal"  // instantiate the float variant
+#undef P
+#define P half
+#include "Split.inc.metal"  // instantiate the half variant
+#undef P
+#undef N
+#undef R
 #undef V
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.inc.metal
index 534166e45f..d80361da46 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.inc.metal
@@ -22,39 +22,39 @@
 #define VECTOR(p, n) CONCAT2(p, n)
 
 kernel void FUNC(transpose, R, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
-                      texture2d_array<P, access::write> outTexture [[texture(1)]],
-                      constant TransposeParam &pm [[buffer(0)]],
-                      uint3 gid [[thread_position_in_grid]]) {
-  VECTOR(P, 4) r;
-  int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0};
-  int iabcd[4], oabcd[4], ixyzn[4];
-  for (int n = 0; n < 4; n++) {
-    oxyzn[3] = n;
+                                  texture2d_array<P, access::write> outTexture [[texture(1)]],
+                                  constant TransposeParam &pm [[buffer(0)]],
+                                  uint3 gid [[thread_position_in_grid]]) {
+    VECTOR(P, 4) r;
+    int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0};
+    int iabcd[4], oabcd[4], ixyzn[4];
+    for (int n = 0; n < 4; n++) {
+        oxyzn[3] = n;
 #if R == 4
-    xyzn2abcd_4(pm.oC, oxyzn, iabcd);
+        xyzn2abcd_4(pm.oC, oxyzn, iabcd);
 #endif // R == 4
 #if R == 3
-    xyzn2abcd_3(oxyzn, oabcd);
+        xyzn2abcd_3(oxyzn, oabcd);
 #endif // R == 3
 #if R == 2
-    xyzn2abcd_2(oxyzn, oabcd);
+        xyzn2abcd_2(oxyzn, oabcd);
 #endif // R == 2
-    iabcd[pm.axis[0]] = oabcd[0];
-    iabcd[pm.axis[1]] = oabcd[1];
-    iabcd[pm.axis[2]] = oabcd[2];
-    iabcd[pm.axis[3]] = oabcd[3];
+        iabcd[pm.axis[0]] = oabcd[0];
+        iabcd[pm.axis[1]] = oabcd[1];
+        iabcd[pm.axis[2]] = oabcd[2];
+        iabcd[pm.axis[3]] = oabcd[3];
 #if R == 4
-    abcd2xyzn_4(pm.iC, iabcd, ixyzn);
+        abcd2xyzn_4(pm.iC, iabcd, ixyzn);
 #endif // R == 4
 #if R == 3
-    abcd2xyzn_3(iabcd, ixyzn);
+        abcd2xyzn_3(iabcd, ixyzn);
 #endif // R == 3
 #if R == 2
-    abcd2xyzn_2(iabcd, ixyzn);
+        abcd2xyzn_2(iabcd, ixyzn);
 #endif // R == 2
-    r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]];
-  }
-  outTexture.write(r, gid.xy, gid.z);
+        r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]];
+    }
+    outTexture.write(r, gid.xy, gid.z);
 }
 
 #endif
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.metal
index 321663b9b7..66c22f0388 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.metal
@@ -17,47 +17,47 @@
 using namespace metal;
 
 struct TransposeParam {
-  int iC;
-  int oC;
-  int axis[4];
+    int iC;
+    int oC;
+    int axis[4];
 };
 
 kernel void transpose_copy_float(texture2d_array<float, access::read> inTexture [[texture(0)]],
-                           texture2d_array<float, access::write> outTexture [[texture(1)]],
-                           constant TransposeParam &pm [[buffer(0)]],
-                           uint3 gid [[thread_position_in_grid]]) {
-  outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z);
+                                 texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                 constant TransposeParam &pm [[buffer(0)]],
+                                 uint3 gid [[thread_position_in_grid]]) {
+    outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z);
 }
 kernel void transpose_copy_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
-                           texture2d_array<half, access::write> outTexture [[texture(1)]],
-                           constant TransposeParam &pm [[buffer(0)]],
-                           uint3 gid [[thread_position_in_grid]]) {
-  outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z);
+                                texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                constant TransposeParam &pm [[buffer(0)]],
+                                uint3 gid [[thread_position_in_grid]]) {
+    outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z);
 }
 
 #define R 4
-  #define P float
-    #include "TransposeKernel.inc.metal"
-  #undef P
-  #define P half
-    #include "TransposeKernel.inc.metal"
-  #undef P
+#define P float
+#include "TransposeKernel.inc.metal"
+#undef P
+#define P half
+#include "TransposeKernel.inc.metal"
+#undef P
 #undef R
 
 #define R 3
-  #define P float
-    #include "TransposeKernel.inc.metal"
-  #undef P
-  #define P half
-    #include "TransposeKernel.inc.metal"
-  #undef P
+#define P float
+#include "TransposeKernel.inc.metal"
+#undef P
+#define P half
+#include "TransposeKernel.inc.metal"
+#undef P
 #undef R
 
 #define R 2
-  #define P float
-    #include "TransposeKernel.inc.metal"
-  #undef P
-  #define P half
-    #include "TransposeKernel.inc.metal"
-  #undef P
+#define P float
+#include "TransposeKernel.inc.metal"
+#undef P
+#define P half
+#include "TransposeKernel.inc.metal"
+#undef P
 #undef R
diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
index 7817befaed..29730fd3b6 100644
--- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
+++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
@@ -1,11 +1,11 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
+ 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
-
+ 
  http://www.apache.org/licenses/LICENSE-2.0
-
+ 
  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,35 +16,35 @@ import UIKit
 
 @UIApplicationMain
 class AppDelegate: UIResponder, UIApplicationDelegate {
-
+    
     var window: UIWindow?
-
+    
     func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplicationLaunchOptionsKey: Any]?) -> Bool {
         // Override point for customization after application launch.
         return true
     }
-
+    
     func applicationWillResignActive(_ application: UIApplication) {
         // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state.
         // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game.
     }
-
+    
     func applicationDidEnterBackground(_ application: UIApplication) {
         // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later.
         // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits.
     }
-
+    
     func applicationWillEnterForeground(_ application: UIApplication) {
         // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background.
     }
-
+    
     func applicationDidBecomeActive(_ application: UIApplication) {
         // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface.
     }
-
+    
     func applicationWillTerminate(_ application: UIApplication) {
         // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:.
     }
-
-
+    
+    
 }
diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
index 98f03affa2..4c5886c7c1 100644
--- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
+++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
@@ -1,11 +1,11 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
+ 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
-
+ 
  http://www.apache.org/licenses/LICENSE-2.0
-
+ 
  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -27,9 +27,9 @@ class ViewController: UIViewController {
             inQueue: queue
         )
         test.testConcat()
-//        test.testReshape()
-//        test.testTranspose()
+        //        test.testReshape()
+        //        test.testTranspose()
         print(" done ")
     }
-
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
index 3aa4e88541..afa580e3cb 100644
--- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
@@ -741,7 +741,7 @@
 				CODE_SIGN_IDENTITY = "iPhone Developer";
 				CODE_SIGN_STYLE = Automatic;
 				DEFINES_MODULE = YES;
-				DEVELOPMENT_TEAM = "";
+				DEVELOPMENT_TEAM = "";
 				DYLIB_COMPATIBILITY_VERSION = 1;
 				DYLIB_CURRENT_VERSION = 1;
 				DYLIB_INSTALL_NAME_BASE = "@rpath";
@@ -778,7 +778,7 @@
 				CODE_SIGN_IDENTITY = "iPhone Developer";
 				CODE_SIGN_STYLE = Automatic;
 				DEFINES_MODULE = YES;
-				DEVELOPMENT_TEAM = "";
+				DEVELOPMENT_TEAM = "";
 				DYLIB_COMPATIBILITY_VERSION = 1;
 				DYLIB_CURRENT_VERSION = 1;
 				DYLIB_INSTALL_NAME_BASE = "@rpath";
diff --git a/metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift b/metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift
index da66460d8b..ba15d890a4 100644
--- a/metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift
+++ b/metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift
@@ -15,24 +15,26 @@
 import Foundation
 
 @objc public enum MetalLoadMode: Int {
-  case
-  LoadMetalInPaddleMobile   = 1,     // 使用 paddle-mobile 中的 metal 代码
-  LoadMetalInDefaultLib     = 2,     // 使用 main bundle 中的 metal 代码
-  LoadMetalInCustomMetalLib = 3      // 使用 metal 库文件
+    case
+    LoadMetalInPaddleMobile   = 1,     // use the Metal code in paddle-mobile
+    LoadMetalInDefaultLib     = 2,     // use the Metal code in the main bundle
+    LoadMetalInCustomMetalLib = 3      // use a Metal library file
 }
 
 @objc public enum ComputePrecision: Int {
-  case
-  Float32 = 1,
-  Float16 = 2
+    case
+    Float32 = 1,
+    Float16 = 2
 }
 
 @objc public class GlobalConfig: NSObject {
-  
-  /// 单例
-  @objc public static let shared: GlobalConfig = GlobalConfig.init()
-  
-  /// 运算精度， runner 生命周期中不可变
-  @objc public var computePrecision: ComputePrecision = .Float16
-
+    
+    /// Shared singleton instance.
+    @objc public static let shared: GlobalConfig = GlobalConfig.init()
+    
+    /// Compute precision; must not change during the lifetime of a runner.
+    @objc public var computePrecision: ComputePrecision = .Float16
+    
+    /// Whether logging is enabled.
+    @objc public var debug: Bool = true
 }
diff --git a/metal/paddle-mobile/paddle-mobile/API/Net.swift b/metal/paddle-mobile/paddle-mobile/API/Net.swift
index 33cedb5712..5087ebfd82 100644
--- a/metal/paddle-mobile/paddle-mobile/API/Net.swift
+++ b/metal/paddle-mobile/paddle-mobile/API/Net.swift
@@ -17,74 +17,74 @@ import Foundation
 
 /// 网络的基类， 参数已经给了默认值，请在子类实现中修改需要改的参数
 @objc open class Net: NSObject {
-
-  /// 默认为0， 如果指定个数， 后边 except 个op不使用 GPU 运算， 中间结果会通过 fetchResult 传参过来
-  @objc public var except: Int = 0
-      
-  /// 预处理 kernel， 如果输入图像需要预处理， 则指定预处理 kernel
-  @objc public var preprocessKernel: CusomKernel? = nil
-  
-  // 以下四个参数为从内存中读取模型时用到的参数
-  /// 模型在内存中的指针
-  @objc public var modelPointer: UnsafeMutableRawPointer? = nil
-  
-  /// 模型大小 单位： 字节
-  @objc public var modelSize: Int = 0
-  
-  /// 权重参数在内存中的指针
-  @objc public var paramPointer: UnsafeMutableRawPointer? = nil
-  
-  /// 权重大小 单位： 字节
-  @objc public var paramSize: Int = 0
-  
-  // 以下两个为从文件中读取模型时用到的参数
-  /// 模型文件路径
-  @objc public var modelPath: String? = nil
-  
-  /// 权重文件路径
-  @objc public var paramPath: String? = nil
-  
-  /// 代表着 GPU 处理器
-  @objc public let device: MTLDevice
-  
-  /// metal 代码加载方式 注意： 如果静态库只能使用 LoadMetalInDefaultLib LoadMetalInCustomMetalLib 进行 load metal 代码
-  @objc public var metalLoadMode: MetalLoadMode = .LoadMetalInPaddleMobile
-  
-  /// 当 metalLoadMode 为 LoadMetalInCustomMetalLib 时， metal library 路径不能为空
-  @objc public var metalLibPath: String? = nil
-  
-  /// 输入维度，按照 n h w c 方式传入
-  @objc public var inputDim: Dim = Dim.init(inDim: [])
-  
-  
-  @objc public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) {
-    self.paramPointer = inParamPointer
-    self.paramSize = inParamSize
-    self.modelPointer = inModelPointer
-    self.modelSize = inModelSize
-    self.device = device
-    super.init()
-  }
-  
-  @objc public init(device: MTLDevice) {
-    self.device = device
-    super.init()
-  }
-  
-  @objc open func resultStr(res: [ResultHolder]) -> String {
-    fatalError()
-  }
-  
-  @objc open func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] {
-    return paddleMobileRes.map { (gpuRes) -> ResultHolder in
-      guard let inResPointer = gpuRes.resultPointer else {
+    
+    /// Defaults to 0. If a count is given, the last `except` ops are not run on the GPU; the intermediate results are passed in through fetchResult.
+    @objc public var except: Int = 0
+    
+    /// Preprocessing kernel; specify one if the input image needs preprocessing.
+    @objc public var preprocessKernel: CusomKernel? = nil
+    
+    // The following four properties are used when loading the model from memory.
+    /// Pointer to the model in memory.
+    @objc public var modelPointer: UnsafeMutableRawPointer? = nil
+    
+    /// Model size, in bytes.
+    @objc public var modelSize: Int = 0
+    
+    /// Pointer to the weight parameters in memory.
+    @objc public var paramPointer: UnsafeMutableRawPointer? = nil
+    
+    /// Weights size, in bytes.
+    @objc public var paramSize: Int = 0
+    
+    // The following two properties are used when loading the model from files.
+    /// Path to the model file.
+    @objc public var modelPath: String? = nil
+    
+    /// Path to the weights file.
+    @objc public var paramPath: String? = nil
+    
+    /// Represents the GPU.
+    @objc public let device: MTLDevice
+    
+    /// How the Metal code is loaded. Note: when built as a static library, only LoadMetalInDefaultLib or LoadMetalInCustomMetalLib can be used to load the Metal code.
+    @objc public var metalLoadMode: MetalLoadMode = .LoadMetalInPaddleMobile
+    
+    /// When metalLoadMode is LoadMetalInCustomMetalLib, the Metal library path must not be empty.
+    @objc public var metalLibPath: String? = nil
+    
+    /// Input dimensions, passed in n h w c order.
+    @objc public var inputDim: Dim = Dim.init(inDim: [])
+    
+    
+    @objc public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) {
+        self.paramPointer = inParamPointer
+        self.paramSize = inParamSize
+        self.modelPointer = inModelPointer
+        self.modelSize = inModelSize
+        self.device = device
+        super.init()
+    }
+    
+    @objc public init(device: MTLDevice) {
+        self.device = device
+        super.init()
+    }
+    
+    @objc open func resultStr(res: [ResultHolder]) -> String {
         fatalError()
-      }
-      return ResultHolder.init(inResult: inResPointer, inCapacity: gpuRes.capacity, inDim: gpuRes.dim)
     }
-  }
-  
-  open func updateProgram(program: Program) {
-  }
-  
+    
+    @objc open func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] {
+        return paddleMobileRes.map { (gpuRes) -> ResultHolder in
+            guard let inResPointer = gpuRes.resultPointer else {
+                fatalError()
+            }
+            return ResultHolder.init(inResult: inResPointer, inCapacity: gpuRes.capacity, inDim: gpuRes.dim)
+        }
+    }
+    
+    open func updateProgram(program: Program) {
+    }
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/API/Runner.swift b/metal/paddle-mobile/paddle-mobile/API/Runner.swift
index 2d7bf9d190..d6c30805eb 100644
--- a/metal/paddle-mobile/paddle-mobile/API/Runner.swift
+++ b/metal/paddle-mobile/paddle-mobile/API/Runner.swift
@@ -16,187 +16,187 @@ import MetalKit
 import Foundation
 
 @objc public class ResultHolder: NSObject {
-  @objc public let result: UnsafeMutablePointer<Float32>
-  @objc public let capacity: Int
-  @objc public let dim: [Int]
-  
-  init(inResult: UnsafeMutablePointer<Float32>, inCapacity: Int, inDim: [Int]) {
-    result = inResult
-    capacity = inCapacity
-    dim = inDim
-  }
-  
-  @objc public func releasePointer() {
-    result.deinitialize(count: capacity)
-    result.deallocate()
-  }
+    @objc public let result: UnsafeMutablePointer<Float32>
+    @objc public let capacity: Int
+    @objc public let dim: [Int]
+    
+    init(inResult: UnsafeMutablePointer<Float32>, inCapacity: Int, inDim: [Int]) {
+        result = inResult
+        capacity = inCapacity
+        dim = inDim
+    }
+    
+    @objc public func releasePointer() {
+        result.deinitialize(count: capacity)
+        result.deallocate()
+    }
 }
 
 @objc public class Runner: NSObject {
-  var program: Program?
-  var executor: Executor<Float32>?
-  var queue: MTLCommandQueue?
-  var textureLoader: MTKTextureLoader?
-  public let net: Net
-  let device: MTLDevice?
-  let numel: Int
-  
-  /// 初始化函数
-  ///
-  /// - Parameters:
-  ///   - inNet: 传入自定义的网络
-  ///   - commandQueue: commandQueue
-  @objc public init(inNet: Net, commandQueue: MTLCommandQueue?) {
-    guard inNet.inputDim.cout() == 4 else {
-      fatalError(" input dim count must 4 ")
-    }
+    var program: Program?
+    var executor: Executor<Float32>?
+    var queue: MTLCommandQueue?
+    var textureLoader: MTKTextureLoader?
+    public let net: Net
+    let device: MTLDevice?
+    let numel: Int
     
-    net = inNet
-    queue = commandQueue
-    device = queue?.device
-    if let inDevice = device {
-      textureLoader = MTKTextureLoader.init(device: inDevice)
-    }
-    numel = net.inputDim.numel()
-  }
-  
-  /// load 模型, 返回 true 可进行预测
-  ///
-  /// - Returns: load 成功或失败
-  @objc public func load() -> Bool {
-      guard let inDevice = device, let inQueue = queue else {
-        print(" paddle mobile gpu load error, need MTLCommandQueue")
-        return false
-      }
-      let loader = Loader<Float32>.init()
-      do {
-        
-        if let inParamPointer = net.paramPointer, let inModelPointer = net.modelPointer {
-          guard net.paramSize > 0 && net.modelSize > 0 else {
-            print(" load from memory param size or model size can't 0 ")
-            return false
-          }
-          program = try loader.load(device: inDevice, paramPointer: inParamPointer, paramSize: net.paramSize,modePointer:inModelPointer,modelSize:net.modelSize)
-        } else if let inModelPath = net.modelPath, let inParamPath = net.paramPath {
-          program = try loader.load(device: inDevice, modelPath: inModelPath, paraPath: inParamPath)
-        } else {
-          print(" model pointer or model file path need be specified")
-          return false
+    /// Initializer.
+    ///
+    /// - Parameters:
+    ///   - inNet: the custom network to run
+    ///   - commandQueue: the command queue
+    @objc public init(inNet: Net, commandQueue: MTLCommandQueue?) {
+        guard inNet.inputDim.cout() == 4 else {
+            fatalError(" input dim count must 4 ")
         }
         
-        let initContext: InitContext = InitContext.init()
-        initContext.metalLoadMode = net.metalLoadMode
-        initContext.metalLibPath = net.metalLibPath
-        executor = try Executor<Float32>.init(inDevice: inDevice, inQueue: inQueue, inProgram: program!, initContext: initContext)
-        net.updateProgram(program: program!)
-      } catch let error {
-        print(error)
-        return false
-      }
-    return true
-  }
-  
-  /// 预测
-  ///
-  /// - Parameters:
-  ///   - texture: 输入 texture 需要使用 getTexture 获得
-  ///   - completion: 结果回调， 当 success 为 true 时 result 不为 nil
-  @objc public func predict(texture: MTLTexture, completion: @escaping ( _ success: Bool, _ result: [ResultHolder]?) -> Void) {
-    do {
-      try self.executor?.predict(input: texture, dim: self.net.inputDim, completionHandle: { [weak self] (res) in
-        guard let SSelf = self else {
-          fatalError( " self nil " )
+        net = inNet
+        queue = commandQueue
+        device = queue?.device
+        if let inDevice = device {
+            textureLoader = MTKTextureLoader.init(device: inDevice)
         }
-        let result = SSelf.net.fetchResult(paddleMobileRes: res)
-        completion(true, result)
-        }, preProcessKernle: self.net.preprocessKernel, except: self.net.except)
-    } catch let error {
-      print(error)
-      completion(false, nil)
-      return
-    }
-  }
-  
-  /// 清理内存, 调用此函数后, 不能再使用, 需重新 load
-  @objc public func clear() {
-    executor?.clear()
-    executor = nil
-    program = nil
-  }
-  
-  /// 获取 texture, 对 texture 进行预处理, 预测时使用
-  ///
-  /// - Parameters:
-  ///   - image: 输入图像
-  ///   - getTexture: 获取 texture 回调
-  @objc public func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) {
-    let texture = try? textureLoader?.newTexture(cgImage: image, options: [:]) ?! " texture loader error"
-    scaleTexture(input: texture!, complete: getTexture)
-  }
-  
-  /// 通过 buffer 获取 texture， 内部会使用GPU进行转换操作
-  ///
-  /// - Parameters:
-  ///   - inBuffer: 输入buffer
-  ///   - getTexture: 结果回调
-  @objc public func getTexture(inBuffer: MTLBuffer, getTexture: @escaping (MTLTexture) -> Void) {
-    guard let inQueue = queue, let inDevice = device else {
-      fatalError( " queue or devcie nil " )
+        numel = net.inputDim.numel()
     }
     
-    guard let buffer = inQueue.makeCommandBuffer() else {
-      fatalError( " make buffer error" )
-    }
-    
-    let bufferToTextureKernel = BufferToTextureKernel.init(device: inDevice, outputDim: Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: net.inputDim[3]), metalLoadMode: net.metalLoadMode, metalLibPath: net.metalLibPath)
-    do {
-      try bufferToTextureKernel.compute(inputBuffer: inBuffer, commandBuffer: buffer)
-    } catch {
-      fatalError(" bufferToTextureKernel error ")
+    /// Loads the model; prediction can proceed once this returns true.
+    ///
+    /// - Returns: whether loading succeeded
+    @objc public func load() -> Bool {
+        guard let inDevice = device, let inQueue = queue else {
+            print(" paddle mobile gpu load error, need MTLCommandQueue")
+            return false
+        }
+        let loader = Loader<Float32>.init()
+        do {
+            
+            if let inParamPointer = net.paramPointer, let inModelPointer = net.modelPointer {
+                guard net.paramSize > 0 && net.modelSize > 0 else {
+                    print(" load from memory param size or model size can't 0 ")
+                    return false
+                }
+                program = try loader.load(device: inDevice, paramPointer: inParamPointer, paramSize: net.paramSize,modePointer:inModelPointer,modelSize:net.modelSize)
+            } else if let inModelPath = net.modelPath, let inParamPath = net.paramPath {
+                program = try loader.load(device: inDevice, modelPath: inModelPath, paraPath: inParamPath)
+            } else {
+                print(" model pointer or model file path need be specified")
+                return false
+            }
+            
+            let initContext: InitContext = InitContext.init()
+            initContext.metalLoadMode = net.metalLoadMode
+            initContext.metalLibPath = net.metalLibPath
+            executor = try Executor<Float32>.init(inDevice: inDevice, inQueue: inQueue, inProgram: program!, initContext: initContext)
+            net.updateProgram(program: program!)
+        } catch let error {
+            print(error)
+            return false
+        }
+        return true
     }
     
-    buffer.addCompletedHandler { (buffer) in
-      getTexture(bufferToTextureKernel.outputTexture)
+    /// Runs prediction.
+    ///
+    /// - Parameters:
+    ///   - texture: input texture; must be obtained via getTexture
+    ///   - completion: result callback; result is non-nil when success is true
+    @objc public func predict(texture: MTLTexture, completion: @escaping ( _ success: Bool, _ result: [ResultHolder]?) -> Void) {
+        do {
+            try self.executor?.predict(input: texture, dim: self.net.inputDim, completionHandle: { [weak self] (res) in
+                guard let SSelf = self else {
+                    fatalError( " self nil " )
+                }
+                let result = SSelf.net.fetchResult(paddleMobileRes: res)
+                completion(true, result)
+                }, preProcessKernle: self.net.preprocessKernel, except: self.net.except)
+        } catch let error {
+            print(error)
+            completion(false, nil)
+            return
+        }
     }
     
-    buffer.commit()
-  }
-
-  /// 更新输入维度， 针对可变长输入模型
-  ///
-  /// - Parameter inDim: 输入维度
-  @objc public func updateInputDim(inDim: Dim) {
-    if net.inputDim != inDim {
-      guard let inProgram = program else {
-        fatalError(" need load first ")
-      }
-      net.inputDim = inDim
-      net.updateProgram(program: inProgram)
+    /// Frees memory. After calling this the runner can no longer be used until load is called again.
+    @objc public func clear() {
+        executor?.clear()
+        executor = nil
+        program = nil
     }
-  }
-  
-  public func scaleTexture(input: MTLTexture , complete: @escaping (MTLTexture) -> Void) {
     
-    guard let inQueue = queue, let inDevice = device else {
-      fatalError( " queue or devcie nil " )
+    /// Gets a texture and preprocesses it, for use in prediction.
+    ///
+    /// - Parameters:
+    ///   - image: input image
+    ///   - getTexture: callback that receives the texture
+    @objc public func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) {
+        let texture = try? textureLoader?.newTexture(cgImage: image, options: [:]) ?! " texture loader error"
+        scaleTexture(input: texture!, complete: getTexture)
     }
     
-    guard let buffer = inQueue.makeCommandBuffer() else {
-      fatalError( " make buffer error" )
+    /// Gets a texture from a buffer; the conversion is performed on the GPU internally.
+    ///
+    /// - Parameters:
+    ///   - inBuffer: input buffer
+    ///   - getTexture: result callback
+    @objc public func getTexture(inBuffer: MTLBuffer, getTexture: @escaping (MTLTexture) -> Void) {
+        guard let inQueue = queue, let inDevice = device else {
+            fatalError( " queue or devcie nil " )
+        }
+        
+        guard let buffer = inQueue.makeCommandBuffer() else {
+            fatalError( " make buffer error" )
+        }
+        
+        let bufferToTextureKernel = BufferToTextureKernel.init(device: inDevice, outputDim: Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: net.inputDim[3]), metalLoadMode: net.metalLoadMode, metalLibPath: net.metalLibPath)
+        do {
+            try bufferToTextureKernel.compute(inputBuffer: inBuffer, commandBuffer: buffer)
+        } catch {
+            fatalError(" bufferToTextureKernel error ")
+        }
+        
+        buffer.addCompletedHandler { (buffer) in
+            getTexture(bufferToTextureKernel.outputTexture)
+        }
+        
+        buffer.commit()
     }
     
-    let scaleKernel = ScaleKernel.init(device: inDevice, shape: Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: 3), metalLoadMode: net.metalLoadMode, metalLibPath: net.metalLibPath)
-    
-    do {
-      try scaleKernel.compute(inputTexuture: input, commandBuffer: buffer)
-    } catch let error {
-      print(error)
-      fatalError()
+    /// Updates the input dimensions, for models with variable-length input.
+    ///
+    /// - Parameter inDim: input dimensions
+    @objc public func updateInputDim(inDim: Dim) {
+        if net.inputDim != inDim {
+            guard let inProgram = program else {
+                fatalError(" need load first ")
+            }
+            net.inputDim = inDim
+            net.updateProgram(program: inProgram)
+        }
     }
     
-    buffer.addCompletedHandler { (buffer) in
-      complete(scaleKernel.outputTexture)
+    public func scaleTexture(input: MTLTexture , complete: @escaping (MTLTexture) -> Void) {
+        
+        guard let inQueue = queue, let inDevice = device else {
+            fatalError( " queue or devcie nil " )
+        }
+        
+        guard let buffer = inQueue.makeCommandBuffer() else {
+            fatalError( " make buffer error" )
+        }
+        
+        let scaleKernel = ScaleKernel.init(device: inDevice, shape: Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: 3), metalLoadMode: net.metalLoadMode, metalLibPath: net.metalLibPath)
+        
+        do {
+            try scaleKernel.compute(inputTexuture: input, commandBuffer: buffer)
+        } catch let error {
+            print(error)
+            fatalError()
+        }
+        
+        buffer.addCompletedHandler { (buffer) in
+            complete(scaleKernel.outputTexture)
+        }
+        buffer.commit()
     }
-    buffer.commit()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/Extensions.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/Extensions.swift
index 12bc909be9..64786d0a45 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Common/Extensions.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Common/Extensions.swift
@@ -16,128 +16,128 @@ import Foundation
 
 // 自定义 ?!  如果 ?! 前的返回值为一个可选值, 则进行隐式解包, 如果有值则返回这个值, 如果为nil 则fatalError 传入的信息
 precedencegroup ExecutedOrFatalError{
-  associativity: left
-  higherThan: AssignmentPrecedence
+    associativity: left
+    higherThan: AssignmentPrecedence
 }
 infix operator ?!: ExecutedOrFatalError
 public func ?!<T>(option: T?, excuteOrError: @autoclosure () -> String) -> T{
-  if let inOpt = option {
-    return inOpt
-  }else{
-    print(excuteOrError())
-    fatalError(excuteOrError())
-  }
+    if let inOpt = option {
+        return inOpt
+    }else{
+        print(excuteOrError())
+        fatalError(excuteOrError())
+    }
 }
 
 //Lense
 struct Lense<A, B> {
-  let from: (A) -> B
-  let to: (B, A) -> A
+    let from: (A) -> B
+    let to: (B, A) -> A
 }
 
 precedencegroup CombineLense{
-  associativity: left
-  higherThan: AssignmentPrecedence
+    associativity: left
+    higherThan: AssignmentPrecedence
 }
 
 infix operator >>>: CombineLense
 func >>><A, B, C>(left: Lense<B, C>, right: Lense<A, B>) -> Lense<A, C> {
-  return Lense<A, C>.init(from: { (a) -> C in
-    left.from(right.from(a))
-  }, to: { (c, a) -> A in
-    right.to( left.to(c, right.from(a)),a)
-  })
+    return Lense<A, C>.init(from: { (a) -> C in
+        left.from(right.from(a))
+    }, to: { (c, a) -> A in
+        right.to( left.to(c, right.from(a)),a)
+    })
 }
 
 protocol CIntIndex {
-  associatedtype T;
-  subscript(index: CInt) -> T { get set};
+    associatedtype T;
+    subscript(index: CInt) -> T { get set};
 }
 
 extension Array: CIntIndex{
-  typealias T = Element
-  subscript(index: CInt) -> T {
-    get{
-      guard Int64(Int.max) >= Int64(index) else{
-        fatalError("cint index out of Int range")
-      }
-      return self[Int(index)]
-    }
-    set{
-      guard Int64(Int.max) >= Int64(index) else{
-        fatalError("cint index out of Int range")
-      }
-      self[Int(index)] = newValue
+    typealias T = Element
+    subscript(index: CInt) -> T {
+        get{
+            guard Int64(Int.max) >= Int64(index) else{
+                fatalError("cint index out of Int range")
+            }
+            return self[Int(index)]
+        }
+        set{
+            guard Int64(Int.max) >= Int64(index) else{
+                fatalError("cint index out of Int range")
+            }
+            self[Int(index)] = newValue
+        }
+        
     }
-    
-  }
 }
 
 extension Array where Element: AnyObject{
-  mutating func remove(element: Element) {
-    if let index = index(where: { (node) -> Bool in
-      return unsafeBitCast(element, to: Int.self) == unsafeBitCast(node, to: Int.self)
-    }) {
-      remove(at: index)
+    mutating func remove(element: Element) {
+        if let index = index(where: { (node) -> Bool in
+            return unsafeBitCast(element, to: Int.self) == unsafeBitCast(node, to: Int.self)
+        }) {
+            remove(at: index)
+        }
     }
-  }
-  
+    
 }
 
 //MARK: Array extension
 extension Array where Element: Comparable{
-  
-  /// 返回数组前 r 个元素, 并将元素处于原数组的位置作为元组的第一个元素返回
-  ///
-  /// - Parameter r: 前 r 个元素
-  /// - Returns: [(原有位置, 排好位置的元素)]
-  public func top(r: Int) -> [(Int, Element)] {
-    precondition(r <= self.count)
-    return Array<(Int, Element)>(zip(0..<self.count, self).sorted{ $0.1 > $1.1 }.prefix(through: r - 1))
-  }
+    
+    /// Returns the `r` largest elements in descending order, each paired with its index in the original array.
+    /// `r == 0` now yields an empty array; the earlier `prefix(through: r - 1)` trapped on index -1.
+    /// - Parameter r: Number of elements to return; must satisfy `0 <= r <= count`.
+    /// - Returns: `[(original index, element)]`, ordered by element, largest first.
+    public func top(r: Int) -> [(Int, Element)] {
+        precondition(r >= 0 && r <= self.count)
+        return Array<(Int, Element)>(zip(0..<self.count, self).sorted{ $0.1 > $1.1 }.prefix(r))
+    }
 }
 
 extension Array {
-  public func strideArray(inCount: Int = 20) -> [(Int, Element)] {
-    if count < inCount {
-      return (0..<count).map{ ($0, self[$0]) }
-    } else {
-      let stride = count / inCount
-      var newArray: [(Int, Element)] = []
-      for i in 0..<inCount {
-        newArray.append((i * stride, self[i * stride]))
-      }
-      return newArray
+    /// Samples up to `inCount` evenly spaced elements, each paired with its index in this array.
+    /// When the array has fewer than `inCount` elements, every element is returned.
+    /// - Parameter inCount: Maximum number of samples; a non-positive value yields an empty result
+    ///   (it previously trapped on division by zero or on an invalid range).
+    /// - Returns: `[(index, element)]` taken at indices `0, step, 2 * step, …` with `step = count / inCount`.
+    public func strideArray(inCount: Int = 20) -> [(Int, Element)] {
+        guard inCount > 0 else { return [] }
+        // Short arrays are returned whole (step 1); longer ones are sampled at the integer stride.
+        let step = count < inCount ? 1 : count / inCount
+        let samples = Swift.min(count, inCount)
+        return (0..<samples).map { ($0 * step, self[$0 * step]) }
     }
-  }
-  
-  public static func floatArrWithBuffer(floatArrBuffer: UnsafeMutablePointer<Float32>, count: Int) -> [Float32] {
-    var arr: [Float32] = []
-    for i in 0..<count {
-      arr.append(floatArrBuffer[i])
+    
+    public static func floatArrWithBuffer(floatArrBuffer: UnsafeMutablePointer<Float32>, count: Int) -> [Float32] {
+        var arr: [Float32] = []
+        for i in 0..<count {
+            arr.append(floatArrBuffer[i])
+        }
+        return arr
     }
-    return arr
-  }
 }
 
 extension UnsafeMutablePointer {
-  public func floatArr(count: Int) -> [Pointee]{
-    var arr: [Pointee] = []
-    for i in 0..<count {
-      arr.append(self[i])
+    public func floatArr(count: Int) -> [Pointee]{
+        var arr: [Pointee] = []
+        for i in 0..<count {
+            arr.append(self[i])
+        }
+        return arr
     }
-    return arr
-  }
 }
 
 extension String {
-  func cStr() -> UnsafePointer<Int8>? {
-    return (self as NSString).utf8String
-  }
+    func cStr() -> UnsafePointer<Int8>? {
+        return (self as NSString).utf8String
+    }
 }
 
 func address<T: AnyObject>(o: T) -> String {
-  return String.init(format: "%018p", unsafeBitCast(o, to: Int.self))
+    return String.init(format: "%018p", unsafeBitCast(o, to: Int.self))
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift
index c3ba777b27..35fffb52ec 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift
@@ -21,615 +21,615 @@ fileprivate var paddleMobileMetalLibrary: MTLLibrary?
 fileprivate var customMetalLibrary: MTLLibrary?
 
 extension MTLDevice {
-  func defaultLibrary() -> MTLLibrary {
-    if defaultMetalLibrary == nil {
-      defaultMetalLibrary = makeDefaultLibrary()
-    }
-    if let inDefaultLib = defaultMetalLibrary {
-      return inDefaultLib
-    } else {
-      fatalError(" default metal libary is nil")
-    }
-  }
-  
-  func customLibrary(metalLibPath: String) -> MTLLibrary {
-    if customMetalLibrary == nil {
-      do {
-        customMetalLibrary = try makeLibrary(filepath: metalLibPath)
-      } catch  let error {
-        fatalError("\(error)")
-      }
+    func defaultLibrary() -> MTLLibrary {
+        if defaultMetalLibrary == nil {
+            defaultMetalLibrary = makeDefaultLibrary()
+        }
+        if let inDefaultLib = defaultMetalLibrary {
+            return inDefaultLib
+        } else {
+            fatalError(" default metal libary is nil")
+        }
     }
     
-    if let inMetalLib = customMetalLibrary {
-      return inMetalLib
-    } else {
-      fatalError(" customlib is nil ")
-    }
-  }
-  
-  func paddleMobileLibrary() -> MTLLibrary {
-    if paddleMobileMetalLibrary == nil {
-      guard let path = Bundle.init(for: Kernel.self).path(forResource: "default", ofType: "metallib") else {
-        fatalError("Counld't find paddle mobile library")
-      }
-      do {
-        paddleMobileMetalLibrary = try makeLibrary(filepath: path)
-      } catch _ {
-        fatalError("Counld't load paddle mobile library")
-      }
+    func customLibrary(metalLibPath: String) -> MTLLibrary {
+        if customMetalLibrary == nil {
+            do {
+                customMetalLibrary = try makeLibrary(filepath: metalLibPath)
+            } catch  let error {
+                fatalError("\(error)")
+            }
+        }
+        
+        if let inMetalLib = customMetalLibrary {
+            return inMetalLib
+        } else {
+            fatalError(" customlib is nil ")
+        }
     }
     
-    if let inPaddleMobileLib = paddleMobileMetalLibrary {
-      return inPaddleMobileLib
-    } else {
-      fatalError("PaddleMobile metal libary is nil")
-    }
-  }
-  
-  func pipeLine(funcName: String, metalLoadMode: MetalLoadMode, metalLibPath: String?) -> MTLComputePipelineState {
-    let useLib: MTLLibrary
-    switch metalLoadMode {
-    case .LoadMetalInDefaultLib:
-      useLib = defaultLibrary()
-    case .LoadMetalInPaddleMobile:
-      useLib = paddleMobileLibrary()
-    case .LoadMetalInCustomMetalLib:
-      useLib = customLibrary(metalLibPath: metalLibPath ?! " can't be nil ")
-    default:
-      fatalError()
+    func paddleMobileLibrary() -> MTLLibrary {
+        if paddleMobileMetalLibrary == nil {
+            guard let path = Bundle.init(for: Kernel.self).path(forResource: "default", ofType: "metallib") else {
+                fatalError("Counld't find paddle mobile library")
+            }
+            do {
+                paddleMobileMetalLibrary = try makeLibrary(filepath: path)
+            } catch _ {
+                fatalError("Counld't load paddle mobile library")
+            }
+        }
+        
+        if let inPaddleMobileLib = paddleMobileMetalLibrary {
+            return inPaddleMobileLib
+        } else {
+            fatalError("PaddleMobile metal libary is nil")
+        }
     }
     
-    guard let function = useLib.makeFunction(name: funcName) else {
-      fatalError(" function " + funcName + " not found")
-    }
-    do {
-      let pipLine = try makeComputePipelineState(function: function)
-      return pipLine
-    } catch let error {
-      print(error)
-      fatalError("make pip line error occured : \(error)")
+    func pipeLine(funcName: String, metalLoadMode: MetalLoadMode, metalLibPath: String?) -> MTLComputePipelineState {
+        let useLib: MTLLibrary
+        switch metalLoadMode {
+        case .LoadMetalInDefaultLib:
+            useLib = defaultLibrary()
+        case .LoadMetalInPaddleMobile:
+            useLib = paddleMobileLibrary()
+        case .LoadMetalInCustomMetalLib:
+            useLib = customLibrary(metalLibPath: metalLibPath ?! " can't be nil ")
+        default:
+            fatalError()
+        }
+        
+        guard let function = useLib.makeFunction(name: funcName) else {
+            fatalError(" function " + funcName + " not found")
+        }
+        do {
+            let pipLine = try makeComputePipelineState(function: function)
+            return pipLine
+        } catch let error {
+            print(error)
+            fatalError("make pip line error occured : \(error)")
+        }
+        
     }
     
-  }
-  
-  func makeBuffer<P>(value: [P]) -> MTLBuffer {
-    let buffer = makeBuffer(length: value.count * MemoryLayout<P>.size, options: MTLResourceOptions.storageModeShared)
-    let contents = buffer?.contents().bindMemory(to: P.self, capacity: value.count * MemoryLayout<P>.size)
-    for i in 0..<value.count {
-      contents?[i] = value[i]
-    }
-    return buffer!
-  }
-  
-  func texture2tensor_loop<P>(texture: MTLTexture, cb: ([Int], P)->Void) -> Void {
-    let bpR = texture.width * 4 * MemoryLayout<P>.size
-    let bpI = texture.height * bpR
-    let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: 1))
-    for i in 0..<texture.arrayLength {
-      let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: bpI)
-      texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i)
-      for tx in 0..<texture.width * texture.height * 4 {
-        var k = tx
-        var xyzn: [Int] = [0, 0, 0, 0]
-        xyzn[1] = k / (texture.width * 4)
-        k %= (texture.width * 4)
-        xyzn[3] = k % 4
-        xyzn[0] = k / 4
-        xyzn[2] = i
-        cb(xyzn, pointer[tx])
-      }
-    }
-  }
-  
-  func texture2tensor_3<P>(texture: MTLTexture, dim: [Int],  transpose: [Int] = [0, 1, 2, 3]) -> [P] {
-    var tdim: [Int] = [1, 1, 1, 1]
-    for i in 0..<dim.count {
-      tdim[4 - dim.count + i] = dim[i]
-    }
-    let count = dim.reduce(1) { $0 * $1 }
-    var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
-    let ndim: [Int] = transpose.map { tdim[$0] }
-    assert(dim.count == 3)
-    assert(texture.width == ndim[3])
-    assert(texture.height == ndim[2])
-    assert(ndim[0] == 1)
-    assert(texture.arrayLength == (ndim[1] + 3) / 4)
-    texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
-      var tg: [Int] = [0, 0, 0, 0]
-      tg[1] = xyzn[2] * 4 + xyzn[3]
-      tg[2] = xyzn[1]
-      tg[3] = xyzn[0]
-      var ig: [Int] = [0, 0, 0, 0]
-      for k in 0..<4 {
-        ig[transpose[k]] = tg[k]
-      }
-      let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
-      if ix < count {
-        tensor[ix] = v
-      }
-    }
-    return tensor
-  }
-  
-  func texture2tensor_2<P>(texture: MTLTexture, dim: [Int],  transpose: [Int] = [0, 1, 2, 3]) -> [P] {
-    var tdim: [Int] = [1, 1, 1, 1]
-    for i in 0..<dim.count {
-      tdim[4 - dim.count + i] = dim[i]
+    /// Creates a shared-storage buffer holding a copy of `value`, sized by element stride; traps with a message if no buffer is returned.
+    func makeBuffer<P>(value: [P]) -> MTLBuffer {
+        guard let buffer = makeBuffer(length: value.count * MemoryLayout<P>.stride, options: MTLResourceOptions.storageModeShared) else {
+            fatalError(" make buffer error, length: \(value.count * MemoryLayout<P>.stride)")
+        }
+        buffer.contents().bindMemory(to: P.self, capacity: value.count).initialize(from: value, count: value.count)
+        return buffer
     }
-    let count = dim.reduce(1) { $0 * $1 }
-    var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
-    let ndim: [Int] = transpose.map { tdim[$0] }
-    assert(dim.count == 2)
-    let w = (ndim[3] + 3) / 4
-    assert(texture.width == w)
-    assert(texture.height == ndim[2])
-    assert(ndim[0] == 1)
-    assert(ndim[1] == 1)
-    assert(texture.arrayLength == 1)
     
-    texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
-      var tg: [Int] = [0, 0, 0, 0]
-      tg[2] = xyzn[1]
-      tg[3] = xyzn[0] * 4 + xyzn[3]
-      var ig: [Int] = [0, 0, 0, 0]
-      for k in 0..<4 {
-        ig[transpose[k]] = tg[k]
-      }
-      let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
-      if ix < count {
-        tensor[ix] = v
-      }
-    }
-    return tensor
-  }
-  
-  func texture2tensor_1<P>(texture: MTLTexture, dim: [Int],  transpose: [Int] = [0, 1, 2, 3]) -> [P] {
-    var tdim: [Int] = [1, 1, 1, 1]
-    for i in 0..<dim.count {
-      tdim[4 - dim.count + i] = dim[i]
+    /// Reads each slice of `texture` back to the CPU and invokes `cb` once per stored component.
+    /// `cb` receives `[x, y, slice, component]` plus the value read at that position.
+    /// The per-slice staging buffer is now released after use; it was previously leaked on every slice.
+    /// NOTE(review): assumes the texture stores four components of type `P` per pixel — confirm at call sites.
+    func texture2tensor_loop<P>(texture: MTLTexture, cb: ([Int], P)->Void) -> Void {
+        let bpR = texture.width * 4 * MemoryLayout<P>.size
+        let bpI = texture.height * bpR
+        let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: 1))
+        for i in 0..<texture.arrayLength {
+            let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: bpI)
+            defer { pointer.deallocate() }
+            texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i)
+            for tx in 0..<texture.width * texture.height * 4 {
+                let k = tx % (texture.width * 4)
+                // [x, y, slice, component]: x = k / 4, y = row of tx, component = k % 4.
+                cb([k / 4, tx / (texture.width * 4), i, k % 4], pointer[tx])
+            }
+        }
     }
-    let count = dim.reduce(1) { $0 * $1 }
-    var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
-    let ndim: [Int] = transpose.map { tdim[$0] }
-    assert(dim.count == 1)
-    let w = (ndim[3] + 3) / 4
-    assert(texture.width == w)
-    assert(texture.height == 1)
-    assert(ndim[0] == 1)
-    assert(ndim[1] == 1)
-    assert(ndim[2] == 1)
-    assert(texture.arrayLength == 1)
     
-    texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
-      var tg: [Int] = [0, 0, 0, 0]
-      tg[3] = xyzn[0] * 4 + xyzn[3]
-      var ig: [Int] = [0, 0, 0, 0]
-      for k in 0..<4 {
-        ig[transpose[k]] = tg[k]
-      }
-      let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
-      if ix < count {
-        tensor[ix] = v
-      }
-    }
-    return tensor
-  }
-  
-  func texture2tensor<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
-    if dim.count == 3 {
-      return texture2tensor_3(texture: texture, dim: dim, transpose: transpose)
-    } else if dim.count == 2 {
-      return texture2tensor_2(texture: texture, dim: dim, transpose: transpose)
-    } else if dim.count == 1 {
-      return texture2tensor_1(texture: texture, dim: dim, transpose: transpose)
-    }
-    var tdim: [Int] = [1, 1, 1, 1]
-    for i in 0..<dim.count {
-      tdim[4 - dim.count + i] = dim[i]
+    func texture2tensor_3<P>(texture: MTLTexture, dim: [Int],  transpose: [Int] = [0, 1, 2, 3]) -> [P] {
+        var tdim: [Int] = [1, 1, 1, 1]
+        for i in 0..<dim.count {
+            tdim[4 - dim.count + i] = dim[i]
+        }
+        let count = dim.reduce(1) { $0 * $1 }
+        var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
+        let ndim: [Int] = transpose.map { tdim[$0] }
+        assert(dim.count == 3)
+        assert(texture.width == ndim[3])
+        assert(texture.height == ndim[2])
+        assert(ndim[0] == 1)
+        assert(texture.arrayLength == (ndim[1] + 3) / 4)
+        texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
+            var tg: [Int] = [0, 0, 0, 0]
+            tg[1] = xyzn[2] * 4 + xyzn[3]
+            tg[2] = xyzn[1]
+            tg[3] = xyzn[0]
+            var ig: [Int] = [0, 0, 0, 0]
+            for k in 0..<4 {
+                ig[transpose[k]] = tg[k]
+            }
+            let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
+            if ix < count {
+                tensor[ix] = v
+            }
+        }
+        return tensor
     }
-    let count = dim.reduce(1) { $0 * $1 }
-    var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
-    let ndim: [Int] = transpose.map { tdim[$0] }
     
-    assert(texture.width == ndim[2])
-    assert(texture.height == ndim[1])
-    assert(texture.arrayLength == (ndim[0] * ndim[3] + 3) / 4)
-    
-    texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
-      var tg: [Int] = [0, 0, 0, 0]
-      tg[1] = xyzn[1]
-      tg[2] = xyzn[0]
-      tg[0] = (xyzn[2] * 4 + xyzn[3]) / ndim[3]
-      tg[3] = (xyzn[2] * 4 + xyzn[3]) % ndim[3]
-      var ig: [Int] = [0, 0, 0, 0]
-      for k in 0..<4 {
-        ig[transpose[k]] = tg[k]
-      }
-      let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
-      if ix < count {
-        tensor[ix] = v
-      }
-    }
-    return tensor
-  }
-  
-  func tensor2texture<P>(value: [P], dim: [Int], transpose: [Int] = [0, 1, 2, 3], inComputePrecision: ComputePrecision = .Float32) -> MTLTexture {
-    if value.count > 0 {
-      assert(value.count == dim.reduce(1) { $0 * $1 })
+    func texture2tensor_2<P>(texture: MTLTexture, dim: [Int],  transpose: [Int] = [0, 1, 2, 3]) -> [P] {
+        var tdim: [Int] = [1, 1, 1, 1]
+        for i in 0..<dim.count {
+            tdim[4 - dim.count + i] = dim[i]
+        }
+        let count = dim.reduce(1) { $0 * $1 }
+        var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
+        let ndim: [Int] = transpose.map { tdim[$0] }
+        assert(dim.count == 2)
+        let w = (ndim[3] + 3) / 4
+        assert(texture.width == w)
+        assert(texture.height == ndim[2])
+        assert(ndim[0] == 1)
+        assert(ndim[1] == 1)
+        assert(texture.arrayLength == 1)
+        
+        texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
+            var tg: [Int] = [0, 0, 0, 0]
+            tg[2] = xyzn[1]
+            tg[3] = xyzn[0] * 4 + xyzn[3]
+            var ig: [Int] = [0, 0, 0, 0]
+            for k in 0..<4 {
+                ig[transpose[k]] = tg[k]
+            }
+            let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
+            if ix < count {
+                tensor[ix] = v
+            }
+        }
+        return tensor
     }
     
-    var tdim: [Int] = [1, 1, 1, 1]
-    for i in 0..<dim.count {
-      tdim[4 - dim.count + i] = dim[i]
+    func texture2tensor_1<P>(texture: MTLTexture, dim: [Int],  transpose: [Int] = [0, 1, 2, 3]) -> [P] {
+        var tdim: [Int] = [1, 1, 1, 1]
+        for i in 0..<dim.count {
+            tdim[4 - dim.count + i] = dim[i]
+        }
+        let count = dim.reduce(1) { $0 * $1 }
+        var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
+        let ndim: [Int] = transpose.map { tdim[$0] }
+        assert(dim.count == 1)
+        let w = (ndim[3] + 3) / 4
+        assert(texture.width == w)
+        assert(texture.height == 1)
+        assert(ndim[0] == 1)
+        assert(ndim[1] == 1)
+        assert(ndim[2] == 1)
+        assert(texture.arrayLength == 1)
+        
+        texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
+            var tg: [Int] = [0, 0, 0, 0]
+            tg[3] = xyzn[0] * 4 + xyzn[3]
+            var ig: [Int] = [0, 0, 0, 0]
+            for k in 0..<4 {
+                ig[transpose[k]] = tg[k]
+            }
+            let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
+            if ix < count {
+                tensor[ix] = v
+            }
+        }
+        return tensor
     }
-    let ndim: [Int] = transpose.map { tdim[$0] }
     
-    let textureDesc = MTLTextureDescriptor.init()
-    textureDesc.width = ndim[2]
-    textureDesc.height = ndim[1]
-    textureDesc.depth = 1
-    textureDesc.usage = [.shaderRead, .shaderWrite]
-    
-    if inComputePrecision == .Float16 {
-      textureDesc.pixelFormat = .rgba16Float
-    } else if inComputePrecision == .Float32 {
-      textureDesc.pixelFormat = .rgba32Float
+    func texture2tensor<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
+        if dim.count == 3 {
+            return texture2tensor_3(texture: texture, dim: dim, transpose: transpose)
+        } else if dim.count == 2 {
+            return texture2tensor_2(texture: texture, dim: dim, transpose: transpose)
+        } else if dim.count == 1 {
+            return texture2tensor_1(texture: texture, dim: dim, transpose: transpose)
+        }
+        var tdim: [Int] = [1, 1, 1, 1]
+        for i in 0..<dim.count {
+            tdim[4 - dim.count + i] = dim[i]
+        }
+        let count = dim.reduce(1) { $0 * $1 }
+        var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
+        let ndim: [Int] = transpose.map { tdim[$0] }
+        
+        assert(texture.width == ndim[2])
+        assert(texture.height == ndim[1])
+        assert(texture.arrayLength == (ndim[0] * ndim[3] + 3) / 4)
+        
+        texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
+            var tg: [Int] = [0, 0, 0, 0]
+            tg[1] = xyzn[1]
+            tg[2] = xyzn[0]
+            tg[0] = (xyzn[2] * 4 + xyzn[3]) / ndim[3]
+            tg[3] = (xyzn[2] * 4 + xyzn[3]) % ndim[3]
+            var ig: [Int] = [0, 0, 0, 0]
+            for k in 0..<4 {
+                ig[transpose[k]] = tg[k]
+            }
+            let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
+            if ix < count {
+                tensor[ix] = v
+            }
+        }
+        return tensor
     }
     
-    textureDesc.textureType = .type2DArray
-    textureDesc.storageMode = .shared
-    textureDesc.cpuCacheMode = .defaultCache
-    textureDesc.arrayLength = (ndim[0] * ndim[3] + 3) / 4
-    let texture = makeTexture(descriptor: textureDesc)!
-    
-    if value.count > 0 {
-      var rcount: Int = (ndim[0] * ndim[3] + 3) / 4
-      rcount = rcount * 4 * ndim[1] * ndim[2]
-      var nvalue: [Float32] = .init(repeating: 0.0, count: rcount)
-      
-      for i0 in 0..<tdim[0] {
-        for i1 in 0..<tdim[1] {
-          for i2 in 0..<tdim[2] {
-            for i3 in 0..<tdim[3] {
-              let ig = [i0, i1, i2, i3]
-              let ix = (i0 * tdim[1] * tdim[2] * tdim[3]) + (i1 * tdim[2] * tdim[3]) + (i2 * tdim[3]) + i3
-              
-              let jg = transpose.map { ig[$0] }
-              let k = jg[0] * ndim[3] + jg[3]
-              let jx = ((k / 4) * ndim[1] * ndim[2] * 4) + (jg[1] * ndim[2] * 4) + (jg[2] * 4) + (k % 4)
-              
-              nvalue[jx] = value[ix] as! Float32
-            }
-          }
-        }
-      }
-      
-      let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: ndim[2], height: ndim[1], depth: 1))
-      if inComputePrecision == .Float16 {
-        let xvalue: [UInt16] = .init(repeating: 0, count: rcount)
-        let pointer: UnsafeMutablePointer<Float32> = UnsafeMutablePointer(mutating: nvalue)
-        let outputP: UnsafeMutablePointer<UInt16> = UnsafeMutablePointer(mutating: xvalue)
-        float32ToFloat16(input: pointer, output: outputP, count: rcount)
-        let bpR = ndim[2] * 4 * 2
-        let bpI = ndim[1] * bpR
-        for i in 0..<textureDesc.arrayLength {
-          let p = outputP + texture.width * texture.height * 4 * i
-          texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: p, bytesPerRow: bpR, bytesPerImage: bpI)
-        }
-      } else {
-        let pointer: UnsafeMutablePointer<Float32> = UnsafeMutablePointer(mutating: nvalue)
-        let bpR = ndim[2] * 4 * MemoryLayout<P>.size
-        let bpI = ndim[1] * bpR
-        for i in 0..<textureDesc.arrayLength {
-          let p = pointer + texture.width * texture.height * 4 * i
-          texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: p, bytesPerRow: bpR, bytesPerImage: bpI)
-        }
-      }
+    /// Packs a row-major tensor into a 2D-array texture holding four components per pixel.
+    /// `dim` is right-aligned into four dimensions and permuted by `transpose`; the result gives height `[1]`, width `[2]` and `([0] * [3] + 3) / 4` slices.
+    /// Staging arrays now reach Metal through scoped buffer pointers; the earlier `UnsafeMutablePointer(mutating: array)` pointers were dangling.
+    /// - Parameter value: Tensor elements, force-cast to `Float32`; an empty array returns the texture unfilled.
+    /// - Parameter dim: Tensor shape (at most four entries); its product is asserted to equal `value.count` when non-empty.
+    /// - Parameter transpose: Permutation applied to the padded four-entry shape.
+    /// - Parameter inComputePrecision: `.Float16` selects `rgba16Float`, `.Float32` selects `rgba32Float`.
+    /// - Returns: The newly created shared-storage texture.
+    func tensor2texture<P>(value: [P], dim: [Int], transpose: [Int] = [0, 1, 2, 3], inComputePrecision: ComputePrecision = .Float32) -> MTLTexture {
+        if value.count > 0 {
+            assert(value.count == dim.reduce(1) { $0 * $1 })
+        }
+        var tdim: [Int] = [1, 1, 1, 1]
+        for i in 0..<dim.count {
+            tdim[4 - dim.count + i] = dim[i]
+        }
+        let ndim: [Int] = transpose.map { tdim[$0] }
+        let textureDesc = MTLTextureDescriptor.init()
+        textureDesc.width = ndim[2]
+        textureDesc.height = ndim[1]
+        textureDesc.depth = 1
+        textureDesc.usage = [.shaderRead, .shaderWrite]
+        if inComputePrecision == .Float16 {
+            textureDesc.pixelFormat = .rgba16Float
+        } else if inComputePrecision == .Float32 {
+            textureDesc.pixelFormat = .rgba32Float
+        }
+        textureDesc.textureType = .type2DArray
+        textureDesc.storageMode = .shared
+        textureDesc.cpuCacheMode = .defaultCache
+        textureDesc.arrayLength = (ndim[0] * ndim[3] + 3) / 4
+        let texture = makeTexture(descriptor: textureDesc)!
+        if value.count > 0 {
+            let sliceElements = 4 * ndim[1] * ndim[2]
+            let rcount = textureDesc.arrayLength * sliceElements
+            var nvalue: [Float32] = .init(repeating: 0.0, count: rcount)
+            for i0 in 0..<tdim[0] {
+                for i1 in 0..<tdim[1] {
+                    for i2 in 0..<tdim[2] {
+                        for i3 in 0..<tdim[3] {
+                            let ig = [i0, i1, i2, i3]
+                            let ix = (i0 * tdim[1] * tdim[2] * tdim[3]) + (i1 * tdim[2] * tdim[3]) + (i2 * tdim[3]) + i3
+                            let jg = transpose.map { ig[$0] }
+                            let k = jg[0] * ndim[3] + jg[3]
+                            let jx = ((k / 4) * ndim[1] * ndim[2] * 4) + (jg[1] * ndim[2] * 4) + (jg[2] * 4) + (k % 4)
+                            nvalue[jx] = value[ix] as! Float32
+                        }
+                    }
+                }
+            }
+            let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: ndim[2], height: ndim[1], depth: 1))
+            if inComputePrecision == .Float16 {
+                var xvalue: [UInt16] = .init(repeating: 0, count: rcount)
+                let bpR = ndim[2] * 4 * 2
+                nvalue.withUnsafeMutableBufferPointer { src -> Void in
+                    xvalue.withUnsafeMutableBufferPointer { dst -> Void in
+                        guard let input = src.baseAddress, let output = dst.baseAddress else { return }
+                        float32ToFloat16(input: input, output: output, count: rcount)
+                        for i in 0..<textureDesc.arrayLength {
+                            texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: output + sliceElements * i, bytesPerRow: bpR, bytesPerImage: ndim[1] * bpR)
+                        }
+                    }
+                }
+            } else {
+                let bpR = ndim[2] * 4 * MemoryLayout<P>.size
+                nvalue.withUnsafeBufferPointer { src -> Void in
+                    guard let input = src.baseAddress else { return }
+                    for i in 0..<textureDesc.arrayLength {
+                        texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: input + sliceElements * i, bytesPerRow: bpR, bytesPerImage: ndim[1] * bpR)
+                    }
+                }
+            }
+        }
+        return texture
     }
-    return texture
-  }
-  
-  func makeFloatTexture<P>(value: [P], textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture{
-    
-    let textureDesc = MTLTextureDescriptor.init()
-    textureDesc.width = textureWidth
-    textureDesc.height = textureHeight
-    textureDesc.depth = 1
-    textureDesc.usage = [.shaderRead, .shaderWrite]
-    textureDesc.pixelFormat = .rgba32Float
-    textureDesc.textureType = .type2DArray
-    textureDesc.storageMode = .shared
-    textureDesc.cpuCacheMode = .defaultCache
-    textureDesc.arrayLength = arrayLength
-    let texture = makeTexture(descriptor: textureDesc)!
     
-    if value.count >= 4{
-      let counts = arrayLength * 4 * textureWidth * textureHeight
-      let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: counts * MemoryLayout<P>.size)
-      for i in 0..<value.count {
-        pointer[i] = value[i]
-      }
-      for i in value.count..<counts {
-        pointer[i] = 0 as! P
-      }
-      
-      let bytesPerRow = texture.width * texture.depth * 4 * MemoryLayout<P>.size
-      let bytesPerImage = texture.height * bytesPerRow
-      let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: texture.depth))
-      for i in 0..<arrayLength {
-        let p = pointer + texture.width * texture.height * 4 * i
-        texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: p, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage)
-      }
-    } else {
-      
+    /// Creates an `rgba32Float` 2D-array texture and, when `value` has at least four elements, fills it.
+    ///
+    /// `value` is copied into a zero-padded staging buffer covering every slice and uploaded slice by slice.
+    /// The staging buffer is released once the upload is done; it was previously leaked on every call.
+    /// - Parameters:
+    ///   - value: Source elements, laid out slice after slice; fewer than four elements leaves the texture unfilled.
+    ///   - textureWidth: Width assigned to the texture descriptor.
+    ///   - textureHeight: Height assigned to the texture descriptor.
+    ///   - arrayLength: Number of slices in the texture array.
+    /// - Returns: The newly created shared-storage texture.
+    func makeFloatTexture<P>(value: [P], textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture{
+        let textureDesc = MTLTextureDescriptor.init()
+        textureDesc.width = textureWidth
+        textureDesc.height = textureHeight
+        textureDesc.depth = 1
+        textureDesc.usage = [.shaderRead, .shaderWrite]
+        textureDesc.pixelFormat = .rgba32Float
+        textureDesc.textureType = .type2DArray
+        textureDesc.storageMode = .shared
+        textureDesc.cpuCacheMode = .defaultCache
+        textureDesc.arrayLength = arrayLength
+        let texture = makeTexture(descriptor: textureDesc)!
+        guard value.count >= 4 else { return texture }
+        let counts = arrayLength * 4 * textureWidth * textureHeight
+        let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: counts * MemoryLayout<P>.size)
+        defer { pointer.deallocate() }
+        for i in 0..<value.count { pointer[i] = value[i] }
+        for i in value.count..<counts { pointer[i] = 0 as! P }
+        let bytesPerRow = texture.width * texture.depth * 4 * MemoryLayout<P>.size
+        let bytesPerImage = texture.height * bytesPerRow
+        let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: texture.depth))
+        for i in 0..<arrayLength {
+            let p = pointer + texture.width * texture.height * 4 * i
+            texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: p, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage)
+        }
+        return texture
     }
-    
-    return texture
-  }
 }
 
 extension MTLComputeCommandEncoder {
-  public func dispatch(computePipline: MTLComputePipelineState, outTexture: MTLTexture) {
-    let slices = (outTexture.arrayLength * 4 + 3)/4
-    
-    let width = computePipline.threadExecutionWidth
-    let height = computePipline.maxTotalThreadsPerThreadgroup/width
-    let threadsPerGroup = MTLSize.init(width: width, height: height, depth: 1)
-    
-//    print(" thread: threads per group: \(threadsPerGroup) ")
-//    print(" thread: out texture width: \(outTexture.width) , out texture height: \(outTexture.height)")
-    
-    let groupWidth = (outTexture.width + width - 1)/width
-    let groupHeight = (outTexture.height + height - 1)/height
-    let groupDepth = slices
-    let groups = MTLSize.init(width: groupWidth, height: groupHeight, depth: groupDepth)
-    
-    setComputePipelineState(computePipline)
-    
-    dispatchThreadgroups(groups, threadsPerThreadgroup: threadsPerGroup)
-  }
+    /// Encodes a dispatch of `computePipline` whose grid covers every texel of `outTexture`.
+    ///
+    /// Threadgroups are `threadExecutionWidth` wide and as tall as
+    /// `maxTotalThreadsPerThreadgroup` allows; the threadgroup count is rounded up in
+    /// x and y and has one layer per texture slice.
+    public func dispatch(computePipline: MTLComputePipelineState, outTexture: MTLTexture) {
+        let groupW = computePipline.threadExecutionWidth
+        let groupH = computePipline.maxTotalThreadsPerThreadgroup / groupW
+        let groupSize = MTLSize.init(width: groupW, height: groupH, depth: 1)
+        // (arrayLength * 4 + 3) / 4 equals arrayLength under integer division.
+        let layers = (outTexture.arrayLength * 4 + 3) / 4
+        let gridSize = MTLSize.init(
+            width: (outTexture.width + groupW - 1) / groupW,     // ceil(width / groupW)
+            height: (outTexture.height + groupH - 1) / groupH,   // ceil(height / groupH)
+            depth: layers)
+        // Bind the pipeline before encoding the dispatch.
+        setComputePipelineState(computePipline)
+        dispatchThreadgroups(gridSize, threadsPerThreadgroup: groupSize)
+    }
 }
 
 public extension MTLTexture {
-  
-  func stridableFloatArray<P>(stridable: Bool = true) -> [(index: Int, value: P)] {
-    var arr: [P] = floatArray { (p: P) -> P in
-      return p;
-    }
-    var result:  [(index: Int, value: P)] = []
-    if arr.count > 100 && stridable {
-      for j in stride(from: 0, to: arr.count , by: arr.count / 100){
-        result.append((j, arr[j]))
-      }
-    } else {
-      for j in 0..<arr.count {
-        result.append((j, arr[j]))
-      }
-    }
-    return result
-  }
-  
-  func floatArray<P, T>(res: (P) -> T) -> [T] {
-    var fArr: [T] = []
-    if textureType == .type2DArray {
-      for i in 0..<arrayLength{
-        let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<P>.size, alignment: MemoryLayout<P>.alignment)
-        let bytesPerRow = width * depth * 4 * MemoryLayout<P>.size
-        let bytesPerImage = width * height * depth * 4 * MemoryLayout<P>.size
-        let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
-        getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i)
-        let p = bytes.assumingMemoryBound(to: P.self)
-        
-        for j in 0..<width * height * depth * 4 {
-          fArr.append(res(p[j]))
-        }
-        bytes.deallocate()
-      }
-    } else if textureType == .type2D {
-      let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<P>.size, alignment: MemoryLayout<P>.alignment)
-      let bytesPerRow = width * depth * 4 * MemoryLayout<P>.size
-      let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
-      getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0)
-      let p = bytes.assumingMemoryBound(to: P.self)
-      
-      for j in 0..<width * height * 4 {
-        fArr.append(res(p[j]))
-      }
-      bytes.deallocate()
+    
+    /// Reads the texture back as `P` and pairs every returned element with its flat index.
+    ///
+    /// With `stridable` set and more than 100 elements, only every
+    /// `count / 100`-th element is returned; otherwise all elements are.
+    /// - Parameter stridable: Whether large read-backs are subsampled.
+    /// - Returns: `(index, value)` pairs in ascending index order.
+    func stridableFloatArray<P>(stridable: Bool = true) -> [(index: Int, value: P)] {
+        let elements: [P] = floatArray { (element: P) -> P in
+            element
+        }
+        // A step of 1 visits every index, which covers the non-subsampled case.
+        let subsample = stridable && elements.count > 100
+        let step = subsample ? elements.count / 100 : 1
+        let indices = stride(from: 0, to: elements.count, by: step)
+        return indices.map { (index: $0, value: elements[$0]) }
     }
-    return fArr
-  }
-  
-  func float32Array() -> [Float32] {
-    if pixelFormat == .rgba32Float {
-      let float32Array = floatArray { (f: Float32) -> Float32 in
-        return f
-      }
-      return float32Array
-    } else if pixelFormat == .rgba16Float {
-      
-      var float16Array = floatArray { (f: Float16) -> Float16 in
-        return f
-      }
-      return float16To32(input: &float16Array, count: float16Array.count)
-    } else {
-      fatalError()
+    
+    /// Copies the texture to CPU memory and maps every component through `res`.
+    ///
+    /// The bytes are reinterpreted as `P`, four components per texel. `.type2DArray`
+    /// textures are read slice by slice, `.type2D` textures in a single pass; any
+    /// other texture type produces an empty array.
+    /// - Parameter res: Conversion applied to each component read.
+    /// - Returns: The converted components in read order.
+    /// - Note: `P` must match the texture's component type; nothing here checks it.
+    func floatArray<P, T>(res: (P) -> T) -> [T] {
+        let isArray = textureType == .type2DArray
+        guard isArray || textureType == .type2D else { return [] }
+        let rowBytes = width * depth * 4 * MemoryLayout<P>.size
+        let area = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
+        // Only the array path scales its component count by depth.
+        let readCount = isArray ? width * height * depth * 4 : width * height * 4
+        var converted: [T] = []
+        for slice in 0..<(isArray ? arrayLength : 1) {
+            // One staging allocation per pass, released when the pass ends.
+            let staging = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<P>.size, alignment: MemoryLayout<P>.alignment)
+            defer { staging.deallocate() }
+            if isArray {
+                getBytes(staging, bytesPerRow: rowBytes, bytesPerImage: rowBytes * height, from: area, mipmapLevel: 0, slice: slice)
+            } else {
+                getBytes(staging, bytesPerRow: rowBytes, from: area, mipmapLevel: 0)
+            }
+            let components = UnsafeBufferPointer(start: staging.assumingMemoryBound(to: P.self), count: readCount)
+            converted.append(contentsOf: components.map(res))
+        }
+        return converted
     }
-  }
-  
-  func logDesc<T>(header: String = "", stridable: Bool = true) -> T? {
-    print(header)
-    print("texture: \(self)")
-    //        let res: [(index: Int, value: T)] = stridableFloatArray(stridable: stridable)
-    //        print(res)
     
-    if textureType == .type2DArray {
-      for i in 0..<arrayLength{
-        var str: String = "slice: \(i): \n"
-        let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<T>.size, alignment: MemoryLayout<T>.alignment)
-        let bytesPerRow = width * depth * 4 * MemoryLayout<T>.size
-        let bytesPerImage = width * height * depth * 4 * MemoryLayout<T>.size
-        let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
-        getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i)
-        let p = bytes.assumingMemoryBound(to: T.self)
-        str += "2d array count : \(width * height * depth * 4) \n"
-        if stridable && width * height * depth * 4 > 20 {
-          for j in stride(from: 0, to: width * height * depth * 4 , by: width * height * depth * 4 / 20){
-            str += " index \(j): \(p[j])"
-          }
+    func float32Array() -> [Float32] { // Reads the texture back as Float32 values; only rgba32Float and rgba16Float are handled.
+        if pixelFormat == .rgba32Float {
+            let float32Array = floatArray { (f: Float32) -> Float32 in
+                return f
+            }
+            return float32Array
+        } else if pixelFormat == .rgba16Float {
+            // Half-precision components are read as Float16 and widened through float16To32.
+            var float16Array = floatArray { (f: Float16) -> Float16 in
+                return f
+            }
+            return float16To32(input: &float16Array, count: float16Array.count)
         } else {
-          for j in 0..<width * height * depth * 4 {
-            str += " index \(j): \(p[j])"
-          }
+            fatalError() // any other pixel format is unsupported and traps
         }
-        
-        bytes.deallocate()
-        print(str)
-      }
-    } else if textureType == .type2D {
-      var str: String = "texture 2D: "
-      let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<T>.size, alignment: MemoryLayout<T>.alignment)
-      let bytesPerRow = width * depth * 4 * MemoryLayout<T>.size
-      let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
-      getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0)
-      let p = bytes.assumingMemoryBound(to: T.self)
-      str += "2d count : \(width * width * 4) \n"
-      
-      if stridable {
-        for j in stride(from: 0, to: width * height * 4, by: width * height * 4 / 20){
-          str += "index \(j): \(p[j]) "
-        }
-      } else {
-        for j in 0..<width * height * 4 {
-          str += "index \(j): \(p[j]) "
-        }
-      }
-      
-      print(str)
-      bytes.deallocate()
     }
-    return nil
     
-  }
-  
-  // n c h w - dim
-  func toTensor(dim: (n: Int, c: Int, h: Int, w: Int)) -> [Float32] {
-    var textureArray: [Float32]
-    if pixelFormat == .rgba32Float {
-      textureArray = floatArray { (i : Float32) -> Float32 in
-        return i
-      }
-    } else if pixelFormat == .rgba16Float {
-      
-      var textureFloat16Array = floatArray { (i : Float16) -> Float16 in
-        return i
-      }
-      textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count)
-    } else {
-      fatalError(" 目前还不支持其他类型 ")
-    }
-    print(textureArray.count)
-    var output: [Float32] = []
-    for s in 0..<arrayLength {
-      for c in 0..<4{
-        for h in 0..<dim.h {
-          for w in 0..<dim.w {
-            if (s * 4 + c) < dim.c {
-              let textureValue = textureArray[dim.w * dim.h * 4 * s + h * dim.w * 4 + w * 4 + c]
-              output.append(textureValue)
+    /// Prints `header`, the texture's description and its components read back as `T`.
+    /// With `stridable`, read-backs of more than 20 components print only every (count / 20)-th one.
+    /// - Returns: Always `nil`; the generic result only lets the caller choose `T`.
+    func logDesc<T>(header: String = "", stridable: Bool = true) -> T? {
+        print(header)
+        print("texture: \(self)")
+        if textureType == .type2DArray {
+            for i in 0..<arrayLength{
+                var str: String = "slice: \(i): \n"
+                let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<T>.size, alignment: MemoryLayout<T>.alignment)
+                let bytesPerRow = width * depth * 4 * MemoryLayout<T>.size
+                let bytesPerImage = width * height * depth * 4 * MemoryLayout<T>.size
+                let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
+                getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i)
+                let p = bytes.assumingMemoryBound(to: T.self)
+                str += "2d array count : \(width * height * depth * 4) \n"
+                if stridable && width * height * depth * 4 > 20 {
+                    for j in stride(from: 0, to: width * height * depth * 4 , by: width * height * depth * 4 / 20){
+                        str += " index \(j): \(p[j])"
+                    }
+                } else {
+                    for j in 0..<width * height * depth * 4 {
+                        str += " index \(j): \(p[j])"
+                    }
+                }
+                // The staging copy is no longer needed once the slice is formatted.
+                bytes.deallocate()
+                print(str)
             }
-          }
+        } else if textureType == .type2D {
+            var str: String = "texture 2D: "
+            let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<T>.size, alignment: MemoryLayout<T>.alignment)
+            let bytesPerRow = width * depth * 4 * MemoryLayout<T>.size
+            let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
+            getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0)
+            let p = bytes.assumingMemoryBound(to: T.self)
+            str += "2d count : \(width * height * 4) \n"
+            // Sample only above 20 components: below that the stride step would be 0, which traps.
+            if stridable && width * height * 4 > 20 {
+                for j in stride(from: 0, to: width * height * 4, by: width * height * 4 / 20){
+                    str += "index \(j): \(p[j]) "
+                }
+            } else {
+                for j in 0..<width * height * 4 {
+                    str += "index \(j): \(p[j]) "
+                }
+            }
+            // Print before releasing the staging copy.
+            print(str)
+            bytes.deallocate()
         }
-      }
+        // Any other texture type prints only the header and the description.
+        return nil
     }
-    return output
-  }
-  
-  func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int)) -> [Float32] {
-//    print("origin dim: \(dim)")
-//    print("texture: ")
-//    print(self)
     
-    var textureArray: [Float32]
-    if pixelFormat == .rgba32Float {
-      textureArray = floatArray { (i : Float32) -> Float32 in
-        return i
-      }
-    } else if pixelFormat == .rgba16Float {
-      var textureFloat16Array = floatArray { (i : Float16) -> Float16 in
-        return i
-      }
-      textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count)
-    } else {
-      fatalError(" 目前还不支持其他类型 ")
+    /// Unpacks the texture into a flat channel-major array: for each channel below
+    /// `dim.c` (slice by slice, 4 channels per slice), all `dim.h * dim.w` values in row order.
+    /// - Parameter dim: Tensor shape as (n, c, h, w); `n` is not read.
+    /// - Returns: The unpacked values as `Float32`.
+    /// NOTE(review): the indexing assumes the texture is exactly `dim.w` x `dim.h`
+    /// texels; confirm against callers.
+    func toTensor(dim: (n: Int, c: Int, h: Int, w: Int)) -> [Float32] {
+        let textureArray: [Float32]
+        switch pixelFormat {
+        case .rgba32Float:
+            textureArray = floatArray { (component: Float32) -> Float32 in component }
+        case .rgba16Float:
+            var halves = floatArray { (component: Float16) -> Float16 in component }
+            textureArray = float16To32(input: &halves, count: halves.count)
+        default:
+            // Only 32-bit and 16-bit float RGBA textures are supported.
+            fatalError(" 目前还不支持其他类型 ")
+        }
+        print(textureArray.count)
+        let sliceStride = dim.w * dim.h * 4
+        var output: [Float32] = []
+        for slice in 0..<arrayLength {
+            for channel in 0..<4 {
+                for row in 0..<dim.h {
+                    for column in 0..<dim.w where slice * 4 + channel < dim.c {
+                        output.append(textureArray[slice * sliceStride + (row * dim.w + column) * 4 + channel])
+                    }
+                }
+            }
+        }
+        return output
     }
     
-    var output: [Float32] = []
-    let numOfASlice = dim.h * dim.w * 4
-    for h in 0..<dim.h {
-      for w in 0..<dim.w {
-        for sliceIndex in 0..<arrayLength {
-          if sliceIndex * 4 + 4 > dim.c {
-            for i in 0..<(4 - ((sliceIndex * 4 + 4) - dim.c)) {
-              let value = textureArray[sliceIndex * numOfASlice + h * dim.w * 4 + w * 4 + i]
-              output.append(value)
-            }
-          } else {
-            for i in 0..<4 {
-              let value = textureArray[sliceIndex * numOfASlice + h * dim.w * 4 + w * 4 + i]
-              output.append(value)
-            }
-          }
-        }
-      }
+    /// Unpacks the texture into a flat array in H-W-C order.
+    ///
+    /// Texel positions are visited in row order; for each position the first `dim.c`
+    /// channels are gathered slice by slice, 4 channels per slice.
+    /// - Parameter dim: Tensor shape as (n, h, w, c); `n` is not read.
+    /// - Returns: The unpacked values as `Float32`.
+    ///
+    /// NOTE(review): the indexing assumes the texture is exactly `dim.w` x `dim.h` texels,
+    /// and a slice that starts beyond `dim.c` makes the channel range inverted, which
+    /// traps; confirm callers never pass such a shape.
+    func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int)) -> [Float32] {
+        let textureArray: [Float32]
+        switch pixelFormat {
+        case .rgba32Float:
+            textureArray = floatArray { (component: Float32) -> Float32 in component }
+        case .rgba16Float:
+            var halves = floatArray { (component: Float16) -> Float16 in component }
+            textureArray = float16To32(input: &halves, count: halves.count)
+        default:
+            // Only 32-bit and 16-bit float RGBA textures are supported.
+            fatalError(" 目前还不支持其他类型 ")
+        }
+        // Texel (row, column) of slice s starts at s * numOfASlice + (row * dim.w + column) * 4.
+        let numOfASlice = dim.h * dim.w * 4
+        var output: [Float32] = []
+        for row in 0..<dim.h {
+            for column in 0..<dim.w {
+                let texelOffset = (row * dim.w + column) * 4
+                for sliceIndex in 0..<arrayLength {
+                    // Full slices contribute 4 channels; the last one only those below dim.c.
+                    let channelCount = min(4, dim.c - sliceIndex * 4)
+                    let base = sliceIndex * numOfASlice + texelOffset
+                    for channel in 0..<channelCount {
+                        output.append(textureArray[base + channel])
+                    }
+                }
+            }
+        }
+        return output
     }
-    return output
-  }
-  
+    
 }
 
 
 public extension MTLBuffer {
-  func logDesc<T>(header: String = "", stridable: Bool = true) -> T? {
-    print(header)
-    print("MTLBuffer: \(self) ")
-    var str = ""
-    if stridable && length/MemoryLayout<T>.stride > 1000{
-      for j in stride(from: 0, to: length, by: length/MemoryLayout<T>.stride / 100){
-        str += " \(contents().assumingMemoryBound(to: T.self)[j])"
-      }
-    } else {
-      for i in 0..<length/MemoryLayout<T>.size {
-        str += " \(contents().assumingMemoryBound(to: T.self)[i])"
-      }
+    /// Prints `header`, the buffer's description and its contents read as `T`.
+    ///
+    /// With `stridable`, buffers holding more than 1000 elements print only every
+    /// `count / 100`-th element.
+    /// - Returns: Always `nil`; the generic result only lets the caller choose `T`.
+    func logDesc<T>(header: String = "", stridable: Bool = true) -> T? {
+        print(header)
+        print("MTLBuffer: \(self) ")
+        let count = length / MemoryLayout<T>.stride
+        let elements = contents().assumingMemoryBound(to: T.self)
+        // Sampling walks element indices, so it is bounded by the element count, not the byte length.
+        let step = (stridable && count > 1000) ? count / 100 : 1
+        let str = stride(from: 0, to: count, by: step).map { " \(elements[$0])" }.joined()
+        print(str)
+        return nil
     }
-    print(str)
-    return nil
-  }
-  
-  func makeTexture(textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture {
-    let textureDesc = MTLTextureDescriptor.init()
-    textureDesc.width = textureWidth
-    textureDesc.height = textureHeight
-    textureDesc.depth = 1
-    textureDesc.usage = [.shaderRead, .shaderWrite]
-    textureDesc.pixelFormat = .rgba32Float
-    textureDesc.textureType = .type2DArray
-    textureDesc.storageMode = .shared
-    textureDesc.cpuCacheMode = .defaultCache
-    textureDesc.arrayLength = arrayLength
-    let texture = makeTexture(descriptor: textureDesc, offset: 0, bytesPerRow: textureWidth * 4 * 4)!
-    return texture
-  }
-  
-  func array<T>() -> [T] {
-    var array: [T] = []
-    let pointer = contents().bindMemory(to: T.self, capacity: length)
-    for i in 0..<(length / MemoryLayout<T>.size) {
-      array.append(pointer[i])
+    
+    /// Creates a shared `.rgba32Float` `.type2DArray` texture backed by this buffer at offset 0; traps if Metal returns nil.
+    func makeTexture(textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture {
+        let descriptor = MTLTextureDescriptor()
+        descriptor.textureType = .type2DArray
+        descriptor.pixelFormat = .rgba32Float
+        descriptor.width = textureWidth
+        descriptor.height = textureHeight
+        descriptor.depth = 1
+        descriptor.arrayLength = arrayLength
+        descriptor.usage = [.shaderRead, .shaderWrite]
+        descriptor.storageMode = .shared
+        descriptor.cpuCacheMode = .defaultCache
+        return makeTexture(descriptor: descriptor, offset: 0, bytesPerRow: textureWidth * 4 * 4)!  // a row is textureWidth * 4 * 4 bytes
+    }
+    
+    /// Copies the buffer's contents into an array of `T`, one element per whole
+    /// `MemoryLayout<T>.stride` bytes of `length`.
+    func array<T>() -> [T] {
+        // bindMemory's capacity is a number of T instances, not a byte count.
+        let count = length / MemoryLayout<T>.stride
+        let pointer = contents().bindMemory(to: T.self, capacity: count)
+        return Array(UnsafeBufferPointer(start: pointer, count: count))
     }
-    return array;
-  }
 }
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
index 724a44b0f4..52c27ccead 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
@@ -89,135 +89,135 @@ public class PaddleMobileUnitTest {
     }
     
     public func testConcat() {
-//        let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
-//        var it: [[Float32]] = []
-//        for _ in 0..<7 {
-//            it.append((0..<12).map { Float32($0) })
-//        }
-//        let input = it.map { device.tensor2texture(value: $0, dim: [3, 4]) }
-//        let output = device.tensor2texture(value: [Float32](), dim: [3, 28])
-//
-//        let param = ConcatTestParam.init(
-//            input: input,
-//            output: output,
-//            dims: [[3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]],
-//            axis: 1,
-//            odim: [3, 28]
-//        )
-//        let concatKernel = ConcatKernel<Float32>.init(device: device, testParam: param)
-//        concatKernel.test(cmdBuffer: buffer, param: param)
-//        buffer.addCompletedHandler { (buffer) in
-//            for i in 0..<it.count {
-//                let _: Float32? = input[i].logDesc()
-//                self.tensorPrint(tensor: it[i], dim: [3, 4])
-//            }
-//            let _: Float32? = output.logDesc()
-//            let tx: [Float32] = self.device.texture2tensor(texture: output, dim: [3, 28])
-//            self.tensorPrint(tensor: tx, dim: [3, 28])
-//        }
-//
-//        buffer.commit()
+        //        let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
+        //        var it: [[Float32]] = []
+        //        for _ in 0..<7 {
+        //            it.append((0..<12).map { Float32($0) })
+        //        }
+        //        let input = it.map { device.tensor2texture(value: $0, dim: [3, 4]) }
+        //        let output = device.tensor2texture(value: [Float32](), dim: [3, 28])
+        //
+        //        let param = ConcatTestParam.init(
+        //            input: input,
+        //            output: output,
+        //            dims: [[3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]],
+        //            axis: 1,
+        //            odim: [3, 28]
+        //        )
+        //        let concatKernel = ConcatKernel<Float32>.init(device: device, testParam: param)
+        //        concatKernel.test(cmdBuffer: buffer, param: param)
+        //        buffer.addCompletedHandler { (buffer) in
+        //            for i in 0..<it.count {
+        //                let _: Float32? = input[i].logDesc()
+        //                self.tensorPrint(tensor: it[i], dim: [3, 4])
+        //            }
+        //            let _: Float32? = output.logDesc()
+        //            let tx: [Float32] = self.device.texture2tensor(texture: output, dim: [3, 28])
+        //            self.tensorPrint(tensor: tx, dim: [3, 28])
+        //        }
+        //
+        //        buffer.commit()
     }
     
     public func testReshape() {
-//        let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
-//        let input: [Float32] = (0..<24).map { Float32($0) }
-//        let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
-//        let outTexture = device.tensor2texture(value: [Float32](), dim: [4, 6])
-//        let mp = ReshapeMetalParam.init(
-//            idim: (1, 2, 3, 4),
-//            itrans: (0, 1, 2, 3),
-//            odim: (1, 1, 4, 6),
-//            otrans: (0, 1, 2, 3)
-//        )
-//        let param = ReshapeTestParam.init(
-//            inputTexture: inTexture,
-//            outputTexture: outTexture,
-//            param: mp
-//        )
-//        let reshapeKernel = ReshapeKernel<Float32>.init(device: device, testParam: param)
-//        reshapeKernel.test(commandBuffer: buffer, testParam: param)
-//        buffer.addCompletedHandler { (buffer) in
-//            let _: Float32? = inTexture.logDesc()
-//            let _: Float32? = outTexture.logDesc()
-//            self.tensorPrint(tensor: input, dim: [2, 3, 4])
-//            let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [4, 6])
-//            self.tensorPrint(tensor: tx, dim: [4, 6])
-//        }
+        //        let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
+        //        let input: [Float32] = (0..<24).map { Float32($0) }
+        //        let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
+        //        let outTexture = device.tensor2texture(value: [Float32](), dim: [4, 6])
+        //        let mp = ReshapeMetalParam.init(
+        //            idim: (1, 2, 3, 4),
+        //            itrans: (0, 1, 2, 3),
+        //            odim: (1, 1, 4, 6),
+        //            otrans: (0, 1, 2, 3)
+        //        )
+        //        let param = ReshapeTestParam.init(
+        //            inputTexture: inTexture,
+        //            outputTexture: outTexture,
+        //            param: mp
+        //        )
+        //        let reshapeKernel = ReshapeKernel<Float32>.init(device: device, testParam: param)
+        //        reshapeKernel.test(commandBuffer: buffer, testParam: param)
+        //        buffer.addCompletedHandler { (buffer) in
+        //            let _: Float32? = inTexture.logDesc()
+        //            let _: Float32? = outTexture.logDesc()
+        //            self.tensorPrint(tensor: input, dim: [2, 3, 4])
+        //            let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [4, 6])
+        //            self.tensorPrint(tensor: tx, dim: [4, 6])
+        //        }
         
-//        let input: [Float32] = (0..<24).map { Float32($0) }
-//        let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
-//        let outTexture = device.tensor2texture(value: [Float32](), dim: [24])
-//        let mp = ReshapeMetalParam.init(
-//            idim: (1, 2, 3, 4),
-//            itrans: (0, 1, 2, 3),
-//            odim: (1, 1, 1, 24),
-//            otrans: (0, 1, 2, 3)
-//        )
-//        let param = ReshapeTestParam.init(
-//            inputTexture: inTexture,
-//            outputTexture: outTexture,
-//            param: mp
-//        )
-//        let reshapeKernel = ReshapeKernel<Float32>.init(device: device, testParam: param)
-//        reshapeKernel.test(commandBuffer: buffer, testParam: param)
-//        buffer.addCompletedHandler { (buffer) in
-//            let _: Float32? = inTexture.logDesc()
-//            let _: Float32? = outTexture.logDesc()
-//            self.tensorPrint(tensor: input, dim: [2, 3, 4])
-//            let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [24])
-//            self.tensorPrint(tensor: tx, dim: [24])
-//        }
-//
-//        
-//        buffer.commit()
+        //        let input: [Float32] = (0..<24).map { Float32($0) }
+        //        let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
+        //        let outTexture = device.tensor2texture(value: [Float32](), dim: [24])
+        //        let mp = ReshapeMetalParam.init(
+        //            idim: (1, 2, 3, 4),
+        //            itrans: (0, 1, 2, 3),
+        //            odim: (1, 1, 1, 24),
+        //            otrans: (0, 1, 2, 3)
+        //        )
+        //        let param = ReshapeTestParam.init(
+        //            inputTexture: inTexture,
+        //            outputTexture: outTexture,
+        //            param: mp
+        //        )
+        //        let reshapeKernel = ReshapeKernel<Float32>.init(device: device, testParam: param)
+        //        reshapeKernel.test(commandBuffer: buffer, testParam: param)
+        //        buffer.addCompletedHandler { (buffer) in
+        //            let _: Float32? = inTexture.logDesc()
+        //            let _: Float32? = outTexture.logDesc()
+        //            self.tensorPrint(tensor: input, dim: [2, 3, 4])
+        //            let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [24])
+        //            self.tensorPrint(tensor: tx, dim: [24])
+        //        }
+        //
+        //        
+        //        buffer.commit()
     }
     
     public func testTranspose() {
-
+        
         let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
-//        var input: [Float32] = []
-//        for i in 0..<72 {
-//            input.append(Float32(i))
-//        }
-////        let inputTexture = device.makeFloatTexture(value: input, textureWidth: 3, textureHeight: 2, arrayLength: 3)
-//        let inputTexture = device.tensor2texture(value: input, dim: [4, 3, 2, 3]);
-//        // group 1
-//        let outputTexture = device.tensor2texture(value: [Float32](), dim: [3, 3, 2, 4])
-//        let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 3, oC: 4, axis: [3, 1, 2, 0])
-////        let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [3, 0, 2, 1])
-////        // group 2
-////        let outputTexture = device.makeFloatTexture(value: [Float32](), textureWidth: 3, textureHeight: 3, arrayLength: 6)
-////        let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 4, axis: [3, 0, 2, 1])
-////
-//        let transposeKernel = TransposeKernel<Float32>.init(device: device, testParam: param)
-//
-//        transposeKernel.test(commandBuffer: buffer, param: param)
-//
-//        buffer.addCompletedHandler { (buffer) in
-//            let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false)
-//            let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false)
-//            self.tensorPrint(tensor: input, dim: [4, 3, 2, 3])
-//            let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 3, 2, 4])
-//            self.tensorPrint(tensor: tx, dim: [3, 3, 2, 4])
-//        }
-//
-//        let input: [Float32] = (0..<24).map { Float32($0) }
-//        let inputTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
-//        let outputTexture = device.tensor2texture(value: [Float](), dim: [3, 4, 2])
-//        let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [0, 2, 3, 1])
-//        let transposeKernel = TransposeKernel<Float32>.init(device: device, testParam: param)
-//
-//        transposeKernel.test(commandBuffer: buffer, param: param)
-//
-//        buffer.addCompletedHandler { (buffer) in
-//            let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false)
-//            let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false)
-//            self.tensorPrint(tensor: input, dim: [2, 3, 4])
-//            let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 4, 2])
-//            self.tensorPrint(tensor: tx, dim: [3, 4, 2])
-//        }
-//        
+        //        var input: [Float32] = []
+        //        for i in 0..<72 {
+        //            input.append(Float32(i))
+        //        }
+        ////        let inputTexture = device.makeFloatTexture(value: input, textureWidth: 3, textureHeight: 2, arrayLength: 3)
+        //        let inputTexture = device.tensor2texture(value: input, dim: [4, 3, 2, 3]);
+        //        // group 1
+        //        let outputTexture = device.tensor2texture(value: [Float32](), dim: [3, 3, 2, 4])
+        //        let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 3, oC: 4, axis: [3, 1, 2, 0])
+        ////        let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [3, 0, 2, 1])
+        ////        // group 2
+        ////        let outputTexture = device.makeFloatTexture(value: [Float32](), textureWidth: 3, textureHeight: 3, arrayLength: 6)
+        ////        let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 4, axis: [3, 0, 2, 1])
+        ////
+        //        let transposeKernel = TransposeKernel<Float32>.init(device: device, testParam: param)
+        //
+        //        transposeKernel.test(commandBuffer: buffer, param: param)
+        //
+        //        buffer.addCompletedHandler { (buffer) in
+        //            let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false)
+        //            let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false)
+        //            self.tensorPrint(tensor: input, dim: [4, 3, 2, 3])
+        //            let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 3, 2, 4])
+        //            self.tensorPrint(tensor: tx, dim: [3, 3, 2, 4])
+        //        }
+        //
+        //        let input: [Float32] = (0..<24).map { Float32($0) }
+        //        let inputTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
+        //        let outputTexture = device.tensor2texture(value: [Float](), dim: [3, 4, 2])
+        //        let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [0, 2, 3, 1])
+        //        let transposeKernel = TransposeKernel<Float32>.init(device: device, testParam: param)
+        //
+        //        transposeKernel.test(commandBuffer: buffer, param: param)
+        //
+        //        buffer.addCompletedHandler { (buffer) in
+        //            let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false)
+        //            let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false)
+        //            self.tensorPrint(tensor: input, dim: [2, 3, 4])
+        //            let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 4, 2])
+        //            self.tensorPrint(tensor: tx, dim: [3, 4, 2])
+        //        }
+        //        
         buffer.commit()
     }
     
@@ -225,72 +225,72 @@ public class PaddleMobileUnitTest {
         let buffer = queue.makeCommandBuffer() ?! " buffer is nil "
         
         let input: [Float32] = [
-         1.0, 2.0, 3.0, 4.0,
-         1.0, 2.0, 3.0, 4.0,
-         1.0, 2.0, 3.0, 4.0,
-         
-         1.0, 2.0, 3.0, 4.0,
-         1.0, 2.0, 3.0, 4.0,
-         1.0, 2.0, 3.0, 4.0,
-         
-         1.0, 2.0, 3.0, 4.0,
-         1.0, 2.0, 3.0, 4.0,
-         1.0, 2.0, 3.0, 4.0,
-        ]
+            1.0, 2.0, 3.0, 4.0,
+            1.0, 2.0, 3.0, 4.0,
+            1.0, 2.0, 3.0, 4.0,
+            
+            1.0, 2.0, 3.0, 4.0,
+            1.0, 2.0, 3.0, 4.0,
+            1.0, 2.0, 3.0, 4.0,
+            
+            1.0, 2.0, 3.0, 4.0,
+            1.0, 2.0, 3.0, 4.0,
+            1.0, 2.0, 3.0, 4.0,
+            ]
         
         let filter: [Float32] = [
-        //1.0
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        //2.0
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        //3.0
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        //4.0
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        ]
+            //1.0
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            //2.0
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            //3.0
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            //4.0
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            ]
         
         let biase: [Float32] = [1.0, 1.0, 1.0, 100.0]
         let newScalue: [Float32] = [1.0, 1.0, 1.0, 1.0]
@@ -324,10 +324,10 @@ public class PaddleMobileUnitTest {
         
         let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize)
         
-      let initContext = InitContext.init()
-      initContext.metalLoadMode = .LoadMetalInDefaultLib
+        let initContext = InitContext.init()
+        initContext.metalLoadMode = .LoadMetalInDefaultLib
         
-      let convAddBnReluKernel = ConvAddBatchNormReluKernel<Float32>.init(device: device, testParam: param, initContext: initContext)
+        let convAddBnReluKernel = ConvAddBatchNormReluKernel<Float32>.init(device: device, testParam: param, initContext: initContext)
         
         convAddBnReluKernel.test(commandBuffer: buffer, param: param)
         
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/Types.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/Types.swift
index ae7b898a8e..701bb37bf2 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Common/Types.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Common/Types.swift
@@ -16,222 +16,222 @@ import Foundation
 import Accelerate
 
 public protocol SummableMultipliable: Equatable {
-  static func +(lhs: Self, rhs: Self) -> Self
-  static func *(lhs: Self, rhs: Self) -> Self
-  static func -(lhs: Self, rhs: Self) -> Self
+    static func +(lhs: Self, rhs: Self) -> Self
+    static func *(lhs: Self, rhs: Self) -> Self
+    static func -(lhs: Self, rhs: Self) -> Self
 }
 public protocol PrecisionType: SummableMultipliable{
-  init(inFloat: Float32)
-  init(inFloat16: Float16)
-  init<P: PrecisionType>(_ inP: P)
-  static var bitSize: UInt { get }
+    init(inFloat: Float32)
+    init(inFloat16: Float16)
+    init<P: PrecisionType>(_ inP: P)
+    static var bitSize: UInt { get }
 }
 
 public typealias Float16 = Int16
 extension Float16: PrecisionType {
-  public static func * (prefix: Float16, postfix: Float16) {
-    return prefix * postfix
-  }
-  
-  public init<P>(_ inP: P) where P : PrecisionType {
-    if P.bitSize == Float32.bitSize {
-      self = Float16(inFloat: inP as! Float32)
-    } else if P.bitSize == Float16.bitSize {
-      self = inP as! Float16
-    } else {
-      fatalError()
+    public static func * (prefix: Float16, postfix: Float16) {
+        return prefix * postfix
+    }
+    
+    public init<P>(_ inP: P) where P : PrecisionType {
+        if P.bitSize == Float32.bitSize {
+            self = Float16(inFloat: inP as! Float32)
+        } else if P.bitSize == Float16.bitSize {
+            self = inP as! Float16
+        } else {
+            fatalError()
+        }
+    }
+    
+    public static var bitSize: UInt {
+        return 16
+    }
+    
+    public init(inFloat16: Float16) {
+        self = inFloat16
+    }
+    public init(inFloat: Float32) {
+        self = Int16(inFloat)
     }
-  }
-  
-  public static var bitSize: UInt {
-    return 16
-  }
-  
-  public init(inFloat16: Float16) {
-    self = inFloat16
-  }
-  public init(inFloat: Float32) {
-    self = Int16(inFloat)
-  }
 }
 
 extension Float32: PrecisionType {
-  public init<P>(_ inP: P) where P : PrecisionType {
-    if P.bitSize == Float32.bitSize {
-      self = inP as! Float32
-    } else if P.bitSize == Float16.bitSize {
-      self = Float32.init(inP as! Float16)
-    } else {
-      fatalError()
+    public init<P>(_ inP: P) where P : PrecisionType {
+        if P.bitSize == Float32.bitSize {
+            self = inP as! Float32
+        } else if P.bitSize == Float16.bitSize {
+            self = Float32.init(inP as! Float16)
+        } else {
+            fatalError()
+        }
+    }
+    
+    public init(inFloat: Float32) {
+        self = inFloat
+    }
+    
+    public init(inFloat16: Float16) {
+        self = Float32.init(inFloat16)
+    }
+    
+    public static var bitSize: UInt {
+        return 32
     }
-  }
-  
-  public init(inFloat: Float32) {
-    self = inFloat
-  }
-  
-  public init(inFloat16: Float16) {
-    self = Float32.init(inFloat16)
-  }
-  
-  public static var bitSize: UInt {
-    return 32
-  }
 }
 
 public func float32ToFloat16(input: UnsafeMutablePointer<Float32>, output: UnsafeMutableRawPointer, count: Int) {
-  var float32Buffer = vImage_Buffer(data: input,  height: 1, width: UInt(count), rowBytes: count * 4)
-  var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2)
-  guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else {
-    fatalError(" float 32 to float 16 error ! ")
-  }
+    var float32Buffer = vImage_Buffer(data: input,  height: 1, width: UInt(count), rowBytes: count * 4)
+    var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2)
+    guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else {
+        fatalError(" float 32 to float 16 error ! ")
+    }
 }
 
 public func float16To32(input: UnsafeMutablePointer<Float16>, count: Int) -> [Float32] {
-  var output = Array<Float>.init(repeating: 0.0, count: count)
-  float16to32(input: input, output: &output, count: count)
-  return output
+    var output = Array<Float>.init(repeating: 0.0, count: count)
+    float16to32(input: input, output: &output, count: count)
+    return output
 }
 
 public func float16to32(input: UnsafeMutablePointer<Float16>, output: UnsafeMutablePointer<Float32>, count: Int) {
-  var bufferFloat16 = vImage_Buffer(data: input,  height: 1, width: UInt(count), rowBytes: count * 2)
-  var bufferFloat32 = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 4)
-  if vImageConvert_Planar16FtoPlanarF(&bufferFloat16, &bufferFloat32, 0) != kvImageNoError {
-    fatalError(" convert float16 to float32 error")
-  }
+    var bufferFloat16 = vImage_Buffer(data: input,  height: 1, width: UInt(count), rowBytes: count * 2)
+    var bufferFloat32 = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 4)
+    if vImageConvert_Planar16FtoPlanarF(&bufferFloat16, &bufferFloat32, 0) != kvImageNoError {
+        fatalError(" convert float16 to float32 error")
+    }
 }
 
 // N - 0   C - 1   H - 2   W - 3
 struct DataLayout {
-  
-  static func NCHW(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout {
-    return DataLayout.init([(.N, dim[0]), (.C, dim[1]), (.H, dim[2]), (.W, dim[3])])
-  }
-  
-  static func NHWC(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout {
-    return DataLayout.init([(.N, dim[0]), (.H, dim[1]), (.W, dim[2]), (.C, dim[3])])
-  }
-  
-  func count() -> Int {
-    return layoutWithDim.count
-  }
-  
-  var N: Int? {
-    get {
-      for layoutDim in layoutWithDim {
-        if layoutDim.0 == .N {
-          return layoutDim.1
-        }
-      }
-      return nil
+    
+    static func NCHW(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout {
+        return DataLayout.init([(.N, dim[0]), (.C, dim[1]), (.H, dim[2]), (.W, dim[3])])
     }
-    set {
-      var newN = (Layout.N, newValue)
-      if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
-        return layout == .N
-      }) {
-        fatalError()
-      }
+    
+    static func NHWC(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout {
+        return DataLayout.init([(.N, dim[0]), (.H, dim[1]), (.W, dim[2]), (.C, dim[3])])
     }
-  }
-  var C: Int? {
-    get {
-      for layoutDim in layoutWithDim {
-        if layoutDim.0 == .C {
-          return layoutDim.1
-        }
-      }
-      return nil
+    
+    func count() -> Int {
+        return layoutWithDim.count
     }
-    set {
-      var newN = (Layout.C, newValue)
-      if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
-        return layout == .N
-      }) {
-        fatalError()
-      }
+    
+    var N: Int? {
+        get {
+            for layoutDim in layoutWithDim {
+                if layoutDim.0 == .N {
+                    return layoutDim.1
+                }
+            }
+            return nil
+        }
+        set {
+            var newN = (Layout.N, newValue)
+            if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
+                return layout == .N
+            }) {
+                fatalError()
+            }
+        }
     }
-  }
-  var H: Int? {
-    get {
-      for layoutDim in layoutWithDim {
-        if layoutDim.0 == .H {
-          return layoutDim.1
+    var C: Int? {
+        get {
+            for layoutDim in layoutWithDim {
+                if layoutDim.0 == .C {
+                    return layoutDim.1
+                }
+            }
+            return nil
+        }
+        set { // stub: nothing is stored; traps when the layout already has a C entry
+            var newN = (Layout.C, newValue)
+            if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
+                return layout == .C // look up the C entry, as the N/H/W setters do for their own axis
+            }) {
+                fatalError()
+            }
         }
-      }
-      return nil
     }
-    set {
-      var newN = (Layout.H, newValue)
-      if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
-        return layout == .H
-      }) {
-        fatalError()
-      }
+    var H: Int? {
+        get {
+            for layoutDim in layoutWithDim {
+                if layoutDim.0 == .H {
+                    return layoutDim.1
+                }
+            }
+            return nil
+        }
+        set {
+            var newN = (Layout.H, newValue)
+            if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
+                return layout == .H
+            }) {
+                fatalError()
+            }
+        }
     }
-  }
-  var W: Int? {
-    get {
-      for layoutDim in layoutWithDim {
-        if layoutDim.0 == .W {
-          return layoutDim.1
+    var W: Int? {
+        get {
+            for layoutDim in layoutWithDim {
+                if layoutDim.0 == .W {
+                    return layoutDim.1
+                }
+            }
+            return nil
+        }
+        set {
+            var newN = (Layout.W, newValue)
+            if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
+                return layout == .W
+            }) {
+                fatalError()
+            }
         }
-      }
-      return nil
     }
-    set {
-      var newN = (Layout.W, newValue)
-      if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
-        return layout == .W
-      }) {
-        fatalError()
-      }
+    
+    
+    init(_ inLayout: [(Layout, Int)]) {
+        layoutWithDim = inLayout
     }
-  }
-  
-  
-  init(_ inLayout: [(Layout, Int)]) {
-    layoutWithDim = inLayout
-  }
-  
-  func layout() -> [Layout] {
-    return layoutWithDim.map({ (layout: Layout, dim: Int) -> Layout in
-      return layout
-    })
-  }
-  
-  var layoutWithDim: [(Layout, Int)] = [(.N, 0), (.C, 0), (.H, 0), (.W, 0)]
-  
-  func convertTo(inLayout: [Layout]) {
     
-  }
-  
-  enum Layout: Int{
-    case N = 0
-    case C = 1
-    case H = 2
-    case W = 3
-    static func defaultLayout() -> [Layout] {
-      return [N, C, H, W]
+    func layout() -> [Layout] {
+        return layoutWithDim.map({ (layout: Layout, dim: Int) -> Layout in
+            return layout
+        })
+    }
+    
+    var layoutWithDim: [(Layout, Int)] = [(.N, 0), (.C, 0), (.H, 0), (.W, 0)]
+    
+    func convertTo(inLayout: [Layout]) {
+        
+    }
+    
+    enum Layout: Int{
+        case N = 0
+        case C = 1
+        case H = 2
+        case W = 3
+        static func defaultLayout() -> [Layout] {
+            return [N, C, H, W]
+        }
     }
-  }
 }
 
 extension DataLayout: Equatable {
-  public static func == (lhs: DataLayout, rhs: DataLayout) -> Bool {
-    if lhs.layoutWithDim.count == rhs.layoutWithDim.count {
-      var result = true
-      for i in 0..<lhs.layoutWithDim.count {
-        result = (lhs.layoutWithDim[i].0 == rhs.layoutWithDim[i].0)
-        if !result {
-          break
+    public static func == (lhs: DataLayout, rhs: DataLayout) -> Bool {
+        if lhs.layoutWithDim.count == rhs.layoutWithDim.count {
+            var result = true
+            for i in 0..<lhs.layoutWithDim.count {
+                result = (lhs.layoutWithDim[i].0 == rhs.layoutWithDim[i].0)
+                if !result {
+                    break
+                }
+            }
+            return result
+        } else {
+            return false
         }
-      }
-      return result
-    } else {
-      return false
     }
-  }
 }
 
 public protocol Variant: CustomStringConvertible, CustomDebugStringConvertible {
@@ -253,42 +253,42 @@ extension MTLTexture where Self: Variant {
 }
 
 public class FetchHolder: Variant {
-  var resultBuffer: MTLBuffer?
-  public var dim: Dim
-  public var capacity: Int
-  public var paddedCapacity: Int
-  
-  init(inPaddedCapacity: Int, inDim: Dim) {
-    paddedCapacity = inPaddedCapacity
-    capacity = inDim.numel()
-    dim = inDim
-  }
-  
-  public func initBuffer(device: MTLDevice) {
-    resultBuffer = device.makeBuffer(length: paddedCapacity * 4, options: [])
-  }
-  
-  var result: UnsafeMutablePointer<Float32> {
-    guard let inResultBuffer = resultBuffer else {
-      fatalError()
+    var resultBuffer: MTLBuffer?
+    public var dim: Dim
+    public var capacity: Int
+    public var paddedCapacity: Int
+    
+    init(inPaddedCapacity: Int, inDim: Dim) {
+        paddedCapacity = inPaddedCapacity
+        capacity = inDim.numel()
+        dim = inDim
     }
-    return inResultBuffer.contents().bindMemory(to: Float32.self, capacity: paddedCapacity)
-  }
-  
+    
+    public func initBuffer(device: MTLDevice) {
+        resultBuffer = device.makeBuffer(length: paddedCapacity * 4, options: [])
+    }
+    
+    var result: UnsafeMutablePointer<Float32> {
+        guard let inResultBuffer = resultBuffer else {
+            fatalError()
+        }
+        return inResultBuffer.contents().bindMemory(to: Float32.self, capacity: paddedCapacity)
+    }
+    
 }
 
 extension FetchHolder: CustomStringConvertible, CustomDebugStringConvertible {
-  public var description: String {
-    fatalError()
-//    return "\(result)"
-  }
-  
-  public var debugDescription: String {
-    fatalError()
-//    return "\(result)"
-  }
-  
-  
+    public var description: String {
+        fatalError()
+        //    return "\(result)"
+    }
+    
+    public var debugDescription: String {
+        fatalError()
+        //    return "\(result)"
+    }
+    
+    
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Dim.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Dim.swift
index 1817184bf7..77b67bf16c 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Dim.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Dim.swift
@@ -15,41 +15,41 @@
 import Foundation
 
 @objc public class Dim: NSObject {
-  private(set) var dims: [Int]
-  
-  @objc public init(inDim: [Int]) {
-    dims = inDim
-  }
-  
-  public func cout() -> Int {
-    return dims.count
-  }
-  
-  public func numel() -> Int {
-    return dims.reduce(1) { $0 * $1 }
-  }
-  
-  public static func ==(left: Dim, right: Dim) -> Bool {
-    return left.dims == right.dims;
-  }
-  
-  public static func !=(left: Dim, right: Dim) -> Bool {
-    return left.dims != right.dims;
-  }
-  
-  public subscript(index: Int) -> Int {
-    return dims[index];
-  }
-  
-  public override var description: String {
-    return "\(dims)"
-  }
-  
-  func swapeDimAt(index1: Int, index2: Int) {
-    dims.swapAt(index1, index2)
-  }
-  
-  private override init(){
-    fatalError()
-  }
+    private(set) var dims: [Int]
+    
+    @objc public init(inDim: [Int]) {
+        dims = inDim
+    }
+    
+    public func cout() -> Int {
+        return dims.count
+    }
+    
+    public func numel() -> Int {
+        return dims.reduce(1) { $0 * $1 }
+    }
+    
+    public static func ==(left: Dim, right: Dim) -> Bool {
+        return left.dims == right.dims;
+    }
+    
+    public static func !=(left: Dim, right: Dim) -> Bool {
+        return left.dims != right.dims;
+    }
+    
+    public subscript(index: Int) -> Int {
+        return dims[index];
+    }
+    
+    public override var description: String {
+        return "\(dims)"
+    }
+    
+    func swapeDimAt(index1: Int, index2: Int) {
+        dims.swapAt(index1, index2)
+    }
+    
+    private override init(){
+        fatalError()
+    }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift
index 8f02bf17bc..9f257200b1 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift
@@ -14,136 +14,141 @@
 
 import Foundation
 
-
 let testTo = 5
 
 var isTest = false
 
 @objc public class GPUResultHolder: NSObject{
-  @objc public let dim: [Int]
-  @objc public let capacity: Int
-  @objc public var resultPointer: UnsafeMutablePointer<Float32>?
-  @objc public var intermediateResults: [String : [MTLBuffer]]?
-  public init(inDim: [Int], inPointer: UnsafeMutablePointer<Float32>?, inCapacity: Int, inIntermediateResults: [String : [MTLBuffer]]? = nil) {
-    dim = inDim
-    capacity = inCapacity
+    @objc public let dim: [Int]
+    @objc public let capacity: Int
+    @objc public var resultPointer: UnsafeMutablePointer<Float32>?
+    @objc public var intermediateResults: [String : [MTLBuffer]]?
+    public init(inDim: [Int], inPointer: UnsafeMutablePointer<Float32>?, inCapacity: Int, inIntermediateResults: [String : [MTLBuffer]]? = nil) {
+        dim = inDim
+        capacity = inCapacity
+        
+        if let inInPointer = inPointer {
+            resultPointer = UnsafeMutablePointer<Float32>.allocate(capacity: inCapacity)
+            resultPointer?.initialize(from: inInPointer, count: inCapacity)
+        }
+        
+        intermediateResults = inIntermediateResults
+    }
     
-    if let inInPointer = inPointer {
-      resultPointer = UnsafeMutablePointer<Float32>.allocate(capacity: inCapacity)
-      resultPointer?.initialize(from: inInPointer, count: inCapacity)
+    public override var description: String {
+        fatalError()
     }
     
-    intermediateResults = inIntermediateResults
-  }
-  
-  public override var description: String {
-    fatalError()
-  }
-  
 }
 
 public class Executor<P: PrecisionType> {
-  var ops: [Runable & InferShaperable] = []
-  var preInputDim: Dim = Dim.init(inDim: [])
-  let program: Program
-  let device: MTLDevice
-  let inflightSemaphore: DispatchSemaphore
-  let queue: MTLCommandQueue
-  init(inDevice:MTLDevice, inQueue: MTLCommandQueue, inProgram: Program, initContext: InitContext) throws {
-    self.inflightSemaphore = DispatchSemaphore(value: 1)
-    program = inProgram
-    device = inDevice
-    queue = inQueue
-    
-    for block in inProgram.programDesc.blocks {
-      //block.ops.count
-      for i in 0..<block.ops.count {
-        let opDesc = block.ops[i]
-        do {
-          let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: opDesc, scope: inProgram.scope, initContext: initContext)
-          ops.append(op)
-        } catch let error {
-          throw error
+    var ops: [Runable & InferShaperable] = []
+    var preInputDim: Dim = Dim.init(inDim: [])
+    let program: Program
+    let device: MTLDevice
+    let inflightSemaphore: DispatchSemaphore
+    let queue: MTLCommandQueue
+    init(inDevice:MTLDevice, inQueue: MTLCommandQueue, inProgram: Program, initContext: InitContext) throws {
+        self.inflightSemaphore = DispatchSemaphore(value: 1)
+        program = inProgram
+        device = inDevice
+        queue = inQueue
+        
+        for block in inProgram.programDesc.blocks {
+            //block.ops.count
+            for i in 0..<block.ops.count {
+                let opDesc = block.ops[i]
+                do {
+                    let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: opDesc, scope: inProgram.scope, initContext: initContext)
+                    ops.append(op)
+                } catch let error {
+                    throw error
+                }
+            }
         }
-      }
-    }
-  }
-  
-  public func predict(input: MTLTexture, dim: Dim, completionHandle: @escaping ([GPUResultHolder]) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws {
-    inflightSemaphore.wait()
-
-    guard let buffer = queue.makeCommandBuffer() else {
-      throw PaddleMobileError.predictError(message: "CommandBuffer is nil")
-    }
-    
-    let resInput: MTLTexture
-    if let inPre = preProcessKernle {
-      do {
-        try inPre.compute(inputTexuture: input, commandBuffer: buffer)
-        resInput = inPre.outputTexture
-      } catch let error {
-        throw error
-      }
-    } else {
-      resInput = input
     }
     
-    let inputTexture = InputTexture.init(inMTLTexture: resInput, inExpectDim: dim)
-    program.scope.setInput(input: inputTexture)
-    //(ops.count - except)
-    for i in 0..<(ops.count - except) {
-      let op = ops[i]
-      do {
-        try op.run(device: device, buffer: buffer)
-      } catch let error {
-        throw error
-      }
-    }
-    
-    var outputTextures: [String : [MTLBuffer]]?
-    if except > 0 {
-      ops[ops.count - except].computeMiddleResult(device: device, buffer: buffer)
-      outputTextures = ops[ops.count - except].inputVariant()
+    /// Runs every op against one command buffer and commits it; does not wait for the GPU.
+    /// Blocks until the previous prediction's completion handler has signalled `inflightSemaphore`.
+    /// - Parameter preProcessKernle: when supplied, `input` is run through it and its output texture is used.
+    /// - Parameter except: number of trailing ops to skip; when > 0 the result holder carries the
+    ///   `inputVariant()` of the first skipped op instead of the fetched output.
+    /// - Throws: `PaddleMobileError.predictError` when no command buffer can be made, or whatever the
+    ///   pre-process kernel or an op throws.
+    public func predict(input: MTLTexture, dim: Dim, completionHandle: @escaping ([GPUResultHolder]) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws {
+        inflightSemaphore.wait()
+        // A `throw` below exits before the completion handler exists, so release the slot here instead.
+        var handedOff = false
+        defer { if !handedOff { inflightSemaphore.signal() } }
+        
+        guard let buffer = queue.makeCommandBuffer() else {
+            throw PaddleMobileError.predictError(message: "CommandBuffer is nil")
+        }
+        
+        let resInput: MTLTexture
+        if let inPre = preProcessKernle {
+            try inPre.compute(inputTexuture: input, commandBuffer: buffer)
+            resInput = inPre.outputTexture
+        } else {
+            resInput = input
+        }
+        
+        let inputTexture = InputTexture.init(inMTLTexture: resInput, inExpectDim: dim)
+        program.scope.setInput(input: inputTexture)
+        // Run every op except the last `except` ones.
+        for i in 0..<(ops.count - except) {
+            try ops[i].run(device: device, buffer: buffer)
+        }
+        
+        var outputTextures: [String : [MTLBuffer]]?
+        if except > 0 {
+            ops[ops.count - except].computeMiddleResult(device: device, buffer: buffer)
+            outputTextures = ops[ops.count - except].inputVariant()
+        }
+        
+        buffer.addCompletedHandler { [weak self] (commandbuffer) in
+            guard let SSelf = self else {
+                fatalError()
+            }
+            
+            // Write the input to a file
+            /*
+             
+             let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
+             print(dim)
+             writeToLibrary(fileName: "yolo_input", array: inputArr)
+             print(" write done ")
+             return
+             */
+            
+            // Print each op's computed output
+            if GlobalConfig.shared.debug {
+                for i in 0..<SSelf.ops.count {
+                    print("第 \(i) 个 op: " )
+                    let op = SSelf.ops[i]
+                    op.delogOutput()
+                }
+            }
+            
+            var resultHolder: GPUResultHolder
+            if except > 0 {
+                resultHolder = GPUResultHolder.init(inDim: [], inPointer: nil, inCapacity: 0,  inIntermediateResults: outputTextures)
+            } else {
+                let outputVar: Variant = SSelf.program.scope.output()!
+                let output: FetchHolder = outputVar as! FetchHolder
+                resultHolder = GPUResultHolder.init(inDim: output.dim.dims, inPointer: output.result, inCapacity: output.capacity)
+            }
+            
+            completionHandle([resultHolder])
+            SSelf.inflightSemaphore.signal()
+        }
+        
+        handedOff = true // from here on the completion handler performs the signal
+        buffer.commit()
     }
     
-    buffer.addCompletedHandler { [weak self] (commandbuffer) in
-      guard let SSelf = self else {
-        fatalError()
-      }
-            
-      //将输入写进文件
-      /*
-       let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
-       print(dim)
-       writeToLibrary(fileName: "test_image_super", array: inputArr)
-       print(" write done ")
-       return
-       */
-      
-      /*    输出 op 计算结果
-       for op in SSelf.ops {
-       op.delogOutput()
-       }
-       */
-      
-      var resultHolder: GPUResultHolder
-      if except > 0 {
-        resultHolder = GPUResultHolder.init(inDim: [], inPointer: nil, inCapacity: 0,  inIntermediateResults: outputTextures)
-      } else {
-        let outputVar: Variant = SSelf.program.scope.output()!
-        let output: FetchHolder = outputVar as! FetchHolder
-        resultHolder = GPUResultHolder.init(inDim: output.dim.dims, inPointer: output.result, inCapacity: output.capacity)
-      }
-      
-      completionHandle([resultHolder])
-      SSelf.inflightSemaphore.signal()
+    public func clear() {
+        program.scope.clear()
     }
     
-    buffer.commit()
-  }
-  
-  public func clear() {
-    program.scope.clear()
-  }
-  
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift
index 1d4f0ec14f..790b961480 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift
@@ -16,251 +16,251 @@ import Foundation
 //import SwiftProtobuf
 
 public class Loader<P: PrecisionType> {
-  class ParaLoader {
-    let file: UnsafeMutablePointer<FILE>
-    let fileSize: Int
-    var nowIndex: Int
-    init(paramPath: String) throws {
-      guard let tmpFile = fopen(paramPath, "rb") else {
-        throw PaddleMobileError.loaderError(message: "open param file error" + paramPath)
-      }
-      file = tmpFile
-      fseek(file, 0, SEEK_END)
-      fileSize = ftell(file)
-      guard fileSize > 0 else {
-        throw PaddleMobileError.loaderError(message: "param file size is too small")
-      }
-      rewind(file)
-      nowIndex = 0
-    }
-    
-    func read(tensor: Tensor<P>) throws {
-      guard nowIndex <= fileSize else {
-        throw PaddleMobileError.loaderError(message: "out of the file range")
-      }
-      
-      func pointerReader<T>(type: T.Type) -> T {
-        let ptr = UnsafeMutablePointer<T>.allocate(capacity: MemoryLayout<T>.size)
-        fread(ptr, 1, MemoryLayout<T>.size, file)
-        nowIndex += MemoryLayout<T>.size
-        let pointee = ptr.pointee
-        ptr.deinitialize(count: MemoryLayout<UInt32>.size)
-        ptr.deallocate()
-        return pointee
-      }
-      
-      let _ = pointerReader(type: UInt32.self)
-      let lodLevel = pointerReader(type: UInt64.self)
-      for _ in 0..<lodLevel {
-        let size = pointerReader(type: UInt64.self)
-        for _ in 0..<Int(size/UInt64(MemoryLayout<size_t>.size)){
-          _ = pointerReader(type: size_t.self)
-        }
-      }
-      
-      let _ = pointerReader(type: UInt32.self)
-      
-      let tensorDescSize = pointerReader(type: Int32.self)
-      
-      fseek(file, Int(tensorDescSize), SEEK_CUR)
-      nowIndex += Int(tensorDescSize)
-      
-      /*
-       这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度
-       */
-      
-      //现在模型传入模型为  Float 类型, 这块应该根据模型来
-      //            let tmpCapacity = MemoryLayout<Float>.size * tensor.numel()
-      //            let tmpPointer = UnsafeMutablePointer<Float>.allocate(capacity: tmpCapacity);
-      let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file)
-      
-      guard bytesRead == tensor.data.size else {
-        throw PaddleMobileError.loaderError(message: "param read size error")
-      }
-      
-      // TODO: use script to convert
-      //            let bytesRead = fread(tmpPointer, 1, tmpCapacity, file)
-      //            for i in 0..<tensor.numel() {
-      //                tensor.data[i] = P.init(inFloat: tmpPointer[i])
-      //            }
-      //            tmpPointer.deinitialize(count: tmpCapacity)
-      //            tmpPointer.deallocate()
-      
-      nowIndex += bytesRead
-    }
-    
-    deinit {
-      fclose(file)
-    }
-  }
-  class ParaLoaderWithPointer {
-    var paramPointer: UnsafeMutableRawPointer
-      let paramSize: Int
-      var nowIndex: Int
-      init(pPointer: UnsafeMutableRawPointer,pSize:Int) throws {
-          paramPointer = UnsafeMutableRawPointer.init(pPointer)
-          paramSize = pSize
-          nowIndex = 0
-      }
-    
-      func read(tensor: Tensor<P>) throws {
-        guard nowIndex <= paramSize else {
-          throw PaddleMobileError.loaderError(message: "out of the file range")
-        }
-        var readerIndex: Int = 0
-        func pointerReader<T>(type: T.Type) -> T {
-          let ptr = UnsafeMutablePointer<T>.allocate(capacity: MemoryLayout<T>.size)
-          memcpy(ptr, paramPointer.advanced(by: Int(readerIndex)), MemoryLayout<T>.size)
-          nowIndex += MemoryLayout<T>.size
-          readerIndex += MemoryLayout<T>.size
-          let pointee = ptr.pointee
-          ptr.deinitialize(count: MemoryLayout<UInt32>.size)
-          ptr.deallocate()
-          
-          return pointee
-        }
-        let _ = pointerReader(type: UInt32.self)
-        let lodLevel = pointerReader(type: UInt64.self)
-        for _ in 0..<lodLevel {
-          let size = pointerReader(type: UInt64.self)
-          for _ in 0..<Int(size/UInt64(MemoryLayout<size_t>.size)){
-            _ = pointerReader(type: size_t.self)
-          }
+    class ParaLoader {
+        let file: UnsafeMutablePointer<FILE>
+        let fileSize: Int
+        var nowIndex: Int
+        init(paramPath: String) throws { // opens the parameter file and records its size in bytes
+            guard let tmpFile = fopen(paramPath, "rb") else {
+                throw PaddleMobileError.loaderError(message: "open param file error" + paramPath)
+            }
+            file = tmpFile
+            nowIndex = 0 // set before the size check: a throw from a fully initialized object runs deinit, which closes `file`
+            fseek(file, 0, SEEK_END)
+            fileSize = ftell(file)
+            guard fileSize > 0 else {
+                throw PaddleMobileError.loaderError(message: "param file size is too small")
+            }
+            rewind(file)
         }
         
-        let _ = pointerReader(type: UInt32.self)
-        let tensorDescSize = pointerReader(type: Int32.self)
-        
-        paramPointer = paramPointer.advanced(by: Int(readerIndex))
-        paramPointer = paramPointer.advanced(by: Int(tensorDescSize))
-        nowIndex += Int(tensorDescSize)
+        /// Reads the next serialized parameter record from `file` into `tensor`.
+        ///
+        /// Consumes, in order: a UInt32, a UInt64 LoD level count, per level a UInt64 byte size
+        /// followed by that many bytes of `size_t` entries, a UInt32, an Int32 tensor-desc size,
+        /// the tensor desc itself (skipped unparsed), and finally `tensor.data.size` raw bytes.
+        /// Precision comes from the generic parameter `P`, not from the stored data type.
+        ///
+        /// - Parameter tensor: Destination whose `data.pointer` receives the raw bytes.
+        /// - Throws: `PaddleMobileError.loaderError` when already past the end of the file, or
+        ///   when a header field or the tensor payload cannot be read in full.
+        func read(tensor: Tensor<P>) throws {
+            guard nowIndex <= fileSize else {
+                throw PaddleMobileError.loaderError(message: "out of the file range")
+            }
+            
+            // Reads one fixed-size header field of type `T` and advances `nowIndex`.
+            func pointerReader<T>(type: T.Type) throws -> T {
+                // `allocate(capacity:)` counts elements of `T`, not bytes: one slot is enough.
+                let ptr = UnsafeMutablePointer<T>.allocate(capacity: 1)
+                defer { ptr.deallocate() }
+                // A short read would leave `ptr` partly unwritten, so fail instead of returning it.
+                guard fread(ptr, 1, MemoryLayout<T>.size, file) == MemoryLayout<T>.size else {
+                    throw PaddleMobileError.loaderError(message: "param read size error")
+                }
+                nowIndex += MemoryLayout<T>.size
+                return ptr.pointee
+            }
+            
+            let _ = try pointerReader(type: UInt32.self)
+            let lodLevel = try pointerReader(type: UInt64.self)
+            for _ in 0..<lodLevel {
+                let size = try pointerReader(type: UInt64.self)
+                for _ in 0..<Int(size/UInt64(MemoryLayout<size_t>.size)){
+                    _ = try pointerReader(type: size_t.self)
+                }
+            }
+            
+            let _ = try pointerReader(type: UInt32.self)
+            
+            let tensorDescSize = try pointerReader(type: Int32.self)
+            
+            // Skip over the serialized tensor desc without parsing it.
+            fseek(file, Int(tensorDescSize), SEEK_CUR)
+            nowIndex += Int(tensorDescSize)
+            
+            // TODO: convert according to the stored data type; for now the payload is copied verbatim.
+            let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file)
+            
+            guard bytesRead == tensor.data.size else {
+                throw PaddleMobileError.loaderError(message: "param read size error")
+            }
+            
+            nowIndex += bytesRead
+        }
         
-        let _ = memcpy(tensor.data.pointer, paramPointer, tensor.data.size)
-        paramPointer = paramPointer.advanced(by: Int(tensor.data.size))
-        nowIndex += tensor.data.size
-    }
-    deinit {
+        deinit { // closes the handle stored in `file` by init
+            fclose(file)
+        }
     }
-  }
-  public init(){}
-  func loadModelandParam(_ device:MTLDevice,_ modelData:Data, _ paraLoaderPointer:ParaLoaderWithPointer?, _ paraLoader:ParaLoader?) throws -> Program {
-    do {
-        /// swift protobuf serialized Data to instance class
-        //      let protoProgram = try PaddleMobile_Framework_Proto_ProgramDesc.init(
-        //        serializedData: modelData)
+    class ParaLoaderWithPointer {
+        var paramPointer: UnsafeMutableRawPointer
+        let paramSize: Int
+        var nowIndex: Int
+        init(pPointer: UnsafeMutableRawPointer,pSize:Int) throws { // records the buffer start and its size; nothing is copied
+            nowIndex = 0
+            paramSize = pSize
+            paramPointer = pPointer
+        }
         
-        /// oc protobuf serialized Data to instance class
-      let protoProgram = try ProgramDesc.init(data: (modelData as NSData) as Data)
-      
-      let originProgramDesc = PMProgramDesc.init(protoProgram: protoProgram)
-      let programDesc = ProgramOptimize<P>.init().optimize(originProgramDesc: originProgramDesc)
-      
-//      let programDesc = PMProgramDesc.init(protoProgram: protoProgram)
-
-      print(programDesc)
-      
-      guard programDesc.blocks.count > 0 else {
-        throw PaddleMobileError.loaderError(message: "count of blocks must greater than 0")
-      }
-      
-      // to get feed key and fetch key
-      let block = programDesc.blocks[0]
-      guard let firstOp = block.ops.first, let lastOp = block.ops.last else {
-        throw PaddleMobileError.loaderError(message: "at least two operator")
-      }
-      
-      guard firstOp.type == gFeedType, lastOp.type == gFetchType else {
-        throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch")
-      }
-      
-      guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else {
-        throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found")
-      }
-      guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else {
-        throw PaddleMobileError.loaderError(message: "feed key or fetch key not found")
-      }
-      
-      let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey)
-      
-      // to load memory
-      for block in programDesc.blocks {
-        for varDesc in block.vars {
-          if (varDesc.type == .LodTensor) {
-            guard let tensorDesc = varDesc.tensorDesc else {
-              throw PaddleMobileError.loaderError(message: "get tensor desc failed")
+        func read(tensor: Tensor<P>) throws {
+            guard nowIndex <= paramSize else {
+                throw PaddleMobileError.loaderError(message: "out of the file range")
             }
-            
-            if (varDesc.persistable
-              && varDesc.type != .FeedMiniBatch
-              && varDesc.type != .FetchList) {
-              let dimArr = tensorDesc.dims
-              
-              guard dimArr.count > 0 else {
-                throw PaddleMobileError.loaderError(message: "tensor desc dim size error")
-              }
-              
-              let dim = Dim.init(inDim: dimArr)
-              let tensor = Tensor<P>.init(inDim: dim, inLayout: tensorDesc.dataLayout)
-              do {
-                if paraLoaderPointer != nil {
-                  try paraLoaderPointer!.read(tensor: tensor)
-                }
+            var readerIndex: Int = 0
+            func pointerReader<T>(type: T.Type) -> T {
+                let ptr = UnsafeMutablePointer<T>.allocate(capacity: MemoryLayout<T>.size)
+                memcpy(ptr, paramPointer.advanced(by: Int(readerIndex)), MemoryLayout<T>.size)
+                nowIndex += MemoryLayout<T>.size
+                readerIndex += MemoryLayout<T>.size
+                let pointee = ptr.pointee
+                ptr.deinitialize(count: MemoryLayout<UInt32>.size)
+                ptr.deallocate()
                 
-                if paraLoader != nil {
-                  try paraLoader!.read(tensor: tensor)
-                }
-              } catch let error {
-                throw error
-              }
-              //              tensor.convert(to: DataLayout.NHWC())
-              //                            tensor.initBuffer(device: device)
-              scope[varDesc.name] = tensor
-            } else {
-              let dim = Dim.init(inDim: tensorDesc.dims)
-              scope[varDesc.name] = Texture.init(device: device, inDim: dim)
+                return pointee
             }
-          } else {
-            if varDesc.name == fetchKey {
-//              scope[varDesc.name] = ResultHolder.init(inDim: [], inResult: [], inCapacity: <#Int#>, inElapsedTime: 0.0)
-            } else if varDesc.name == feedKey {
+            let _ = pointerReader(type: UInt32.self)
+            let lodLevel = pointerReader(type: UInt64.self)
+            for _ in 0..<lodLevel {
+                let size = pointerReader(type: UInt64.self)
+                for _ in 0..<Int(size/UInt64(MemoryLayout<size_t>.size)){
+                    _ = pointerReader(type: size_t.self)
+                }
             }
-          }
+            
+            let _ = pointerReader(type: UInt32.self)
+            let tensorDescSize = pointerReader(type: Int32.self)
+            
+            paramPointer = paramPointer.advanced(by: Int(readerIndex))
+            paramPointer = paramPointer.advanced(by: Int(tensorDescSize))
+            nowIndex += Int(tensorDescSize)
+            
+            let _ = memcpy(tensor.data.pointer, paramPointer, tensor.data.size)
+            paramPointer = paramPointer.advanced(by: Int(tensor.data.size))
+            nowIndex += tensor.data.size
+        }
+        deinit { // intentionally empty. NOTE(review): presumably the caller owns the parameter buffer — confirm
         }
-      }
-      
-      let program = Program.init(inProgramDesc: programDesc, inScope: scope)
-      
-      return program
-    } catch _ {
-      throw PaddleMobileError.loaderError(message: "protobuf decoder error")
-    }
-  }
-  public func load(device:MTLDevice, paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) throws -> Program {
-    let modelData = Data.init(bytes:modePointer, count:modelSize)
-    guard let paraLoader = try? ParaLoaderWithPointer.init(pPointer: paramPointer,pSize: paramSize) else {
-      throw PaddleMobileError.loaderError(message: "load para error")
-    }
-    do {
-      let program = try loadModelandParam(device,modelData,paraLoader,nil)
-      return program
-    } catch let error {
-      throw error
     }
-  }
-    
-  public func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program{
-    guard let modelData = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else {
-      throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !")
+    public init(){}
+    /// Decodes a serialized program and loads its persistable parameters into a new scope.
+    ///
+    /// The first op of block 0 must be a feed op and the last a fetch op. Persistable
+    /// `LodTensor` variables are read from the non-nil parameter loader(s); every
+    /// other `LodTensor` variable gets a `Texture` created on `device`.
+    ///
+    /// - Parameters:
+    ///   - device: Metal device used to create textures for non-persistable tensors.
+    ///   - modelData: Serialized program description.
+    ///   - paraLoaderPointer: In-memory parameter source, or nil.
+    ///   - paraLoader: File-backed parameter source, or nil.
+    /// - Returns: The optimized program description together with its populated scope.
+    /// - Throws: `PaddleMobileError.loaderError`. Errors of that type raised while loading keep
+    ///   their own message; any other error is reported as "protobuf decoder error".
+    func loadModelandParam(_ device:MTLDevice,_ modelData:Data, _ paraLoaderPointer:ParaLoaderWithPointer?, _ paraLoader:ParaLoader?) throws -> Program {
+        do {
+            // Decode with the Objective-C protobuf classes.
+            let protoProgram = try ProgramDesc.init(data: (modelData as NSData) as Data)
+            
+            let originProgramDesc = PMProgramDesc.init(protoProgram: protoProgram)
+            let programDesc = ProgramOptimize<P>.init().optimize(originProgramDesc: originProgramDesc)
+            
+            // NOTE(review): debug output of the whole program; consider removing for release builds.
+            print(programDesc)
+            
+            guard programDesc.blocks.count > 0 else {
+                throw PaddleMobileError.loaderError(message: "count of blocks must greater than 0")
+            }
+            
+            // to get feed key and fetch key
+            let block = programDesc.blocks[0]
+            guard let firstOp = block.ops.first, let lastOp = block.ops.last else {
+                throw PaddleMobileError.loaderError(message: "at least two operator")
+            }
+            
+            guard firstOp.type == gFeedType, lastOp.type == gFetchType else {
+                throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch")
+            }
+            
+            guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else {
+                throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found")
+            }
+            guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else {
+                throw PaddleMobileError.loaderError(message: "feed key or fetch key not found")
+            }
+            
+            let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey)
+            
+            // to load memory
+            for block in programDesc.blocks {
+                for varDesc in block.vars {
+                    // Only LodTensor variables get a scope entry; the former feed/fetch branches were empty.
+                    guard varDesc.type == .LodTensor else {
+                        continue
+                    }
+                    guard let tensorDesc = varDesc.tensorDesc else {
+                        throw PaddleMobileError.loaderError(message: "get tensor desc failed")
+                    }
+                    
+                    if (varDesc.persistable
+                        && varDesc.type != .FeedMiniBatch
+                        && varDesc.type != .FetchList) {
+                        let dimArr = tensorDesc.dims
+                        
+                        guard dimArr.count > 0 else {
+                            throw PaddleMobileError.loaderError(message: "tensor desc dim size error")
+                        }
+                        
+                        let dim = Dim.init(inDim: dimArr)
+                        let tensor = Tensor<P>.init(inDim: dim, inLayout: tensorDesc.dataLayout)
+                        // Both `load` entry points pass exactly one non-nil loader; each call consumes one record.
+                        try paraLoaderPointer?.read(tensor: tensor)
+                        try paraLoader?.read(tensor: tensor)
+                        scope[varDesc.name] = tensor
+                    } else {
+                        let dim = Dim.init(inDim: tensorDesc.dims)
+                        scope[varDesc.name] = Texture.init(device: device, inDim: dim)
+                    }
+                }
+            }
+            
+            let program = Program.init(inProgramDesc: programDesc, inScope: scope)
+            
+            return program
+        } catch let error as PaddleMobileError {
+            // Loader failures raised above already carry a precise message; pass them through
+            // instead of relabelling them as a decoder error.
+            throw error
+        } catch _ {
+            throw PaddleMobileError.loaderError(message: "protobuf decoder error")
+        }
     }
-    guard let paraLoader = try? ParaLoader.init(paramPath: paraPath) else {
-      throw PaddleMobileError.loaderError(message: "load para error")
+    /// Loads a program whose model and parameters are both already in memory.
+    ///
+    /// `modePointer`/`modelSize` locate the serialized model, which is copied into a `Data`;
+    /// `paramPointer`/`paramSize` locate the parameter bytes handed to the pointer loader.
+    /// - Throws: `PaddleMobileError.loaderError`; errors from `loadModelandParam` propagate unchanged.
+    public func load(device:MTLDevice, paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) throws -> Program {
+        let serializedModel = Data.init(bytes:modePointer, count:modelSize)
+        guard let pointerLoader = try? ParaLoaderWithPointer.init(pPointer: paramPointer,pSize: paramSize) else {
+            throw PaddleMobileError.loaderError(message: "load para error")
+        }
+        return try loadModelandParam(device, serializedModel, pointerLoader, nil)
     }
     
-    do {
-      let program = try loadModelandParam(device,modelData,nil,paraLoader)
-      return program
-    } catch let error {
-      throw error
+    /// Loads a program from a model file and a parameter file on disk.
+    ///
+    /// - Throws: `PaddleMobileError.loaderError` if the model cannot be read or the parameter
+    ///   loader cannot be created; errors from `loadModelandParam` propagate unchanged.
+    public func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program{
+        let modelURL = URL.init(fileURLWithPath: modelPath)
+        guard let serializedModel = try? Data.init(contentsOf: modelURL) else {
+            throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !")
+        }
+        guard let fileLoader = try? ParaLoader.init(paramPath: paraPath) else {
+            throw PaddleMobileError.loaderError(message: "load para error")
+        }
+        
+        return try loadModelandParam(device, serializedModel, nil, fileLoader)
     }
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift
index 97fe0a8fba..adce101552 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift
@@ -17,337 +17,337 @@ import MetalKit
 import CoreMedia
 
 protocol Tensorial: Variant {
-  var dim: Dim { get set }
-  func numel() -> Int
-  var layout: DataLayout { get }
+    var dim: Dim { get set }
+    func numel() -> Int
+    var layout: DataLayout { get }
 }
 
 extension Tensorial {
-  func numel() -> Int {
-    return dim.numel()
-  }
+    func numel() -> Int {
+        return dim.numel()
+    }
 }
 
 
 
 class Tensor<P: PrecisionType>: Tensorial {
-  
-  var data: Data
-  var dim: Dim
-  var buffer: MTLBuffer!
-  private(set) var layout: DataLayout
-  
-  class Data {
-    init(inSize: Int, inPointer: UnsafeMutablePointer<P>) {
-      size = inSize
-      pointer = inPointer
-    }
-    let size: Int
-    var pointer: UnsafeMutablePointer<P>
-    subscript(index: Int) -> P{
-      get {
-        return pointer[index]
-      }
-      set {
-        pointer[index] = newValue
-      }
-    }
-    func release() {
-      pointer.deinitialize(count: size)
-      pointer.deallocate()
-    }
-    deinit {
-      //            release()
-    }
-  }
-  
-  init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) {
-    dim = inDim
-    let size = inDim.numel() * MemoryLayout<P>.size
-    let pointer = UnsafeMutablePointer<P>.allocate(capacity: size)
-    data = Data.init(inSize: size, inPointer: pointer)
-    layout = inLayout
-  }
-  
-  func convert(to: DataLayout) {
-    guard to != layout else {
-      return
-    }
     
-    guard dim.cout() == 4 else {
-      return
-    }
+    var data: Data
+    var dim: Dim
+    var buffer: MTLBuffer!
+    private(set) var layout: DataLayout
     
-    guard layout == DataLayout.NCHW() && to == DataLayout.NHWC() else {
-      // other not support
-      return
-    }
-    let newPointer = UnsafeMutablePointer<P>.allocate(capacity: data.size)
-    
-    if layout == DataLayout.NCHW() {
-      NCHW2NHWC(newPtr: newPointer)
+    /// Raw element storage for a tensor: a pointer plus the `size` it was created with.
+    class Data {
+        let size: Int
+        var pointer: UnsafeMutablePointer<P>
+        init(inSize: Int, inPointer: UnsafeMutablePointer<P>) {
+            size = inSize
+            pointer = inPointer
+        }
+        
+        /// Unchecked element access through `pointer`.
+        subscript(index: Int) -> P{
+            get { return pointer[index] }
+            set { pointer[index] = newValue }
+        }
+        
+        /// Deinitializes `size` elements at `pointer`, then frees it. Not called from `deinit`.
+        /// NOTE(review): `Tensor.init` passes a byte count as `size`, but `count:` is in elements — confirm.
+        func release() {
+            pointer.deinitialize(count: size)
+            pointer.deallocate()
+        }
+        deinit {} // the release() call here is disabled, so storage must be freed explicitly
     }
     
-    data.release()
-    data.pointer = newPointer
-    layout = to
-  }
-  
-
-  
-  func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16, padWhenOneC: Bool = false, convertToNHWC: Bool = true, withTranspose: Bool = false) {
-    if convertToNHWC {
-//      print(layout)
-      convert(to: DataLayout.NHWC())
+    init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) { // allocates uninitialized storage for a tensor of shape `inDim`
+        dim = inDim
+        let size = inDim.numel() * MemoryLayout<P>.size // byte count of all elements
+        let pointer = UnsafeMutablePointer<P>.allocate(capacity: size) // NOTE(review): capacity counts elements of P, so passing a byte count over-allocates — confirm before changing
+        data = Data.init(inSize: size, inPointer: pointer)
+        layout = inLayout
     }
     
-    if withTranspose {
-      let transposePointer = UnsafeMutablePointer<P>.allocate(capacity: numel())
-      let n = dim[0]
-      let hwc = numel()/n
-      for j in 0..<hwc {
-        for i in 0..<n {
-          //data[i * hwc + j]
-          transposePointer[j * n + i] = data[i * hwc + j]
+    func convert(to: DataLayout) {
+        guard to != layout else {
+            return
         }
-      }
-
-      dim.swapeDimAt(index1: 0, index2: 3)
-      data.release()
-      data.pointer = transposePointer
+        
+        guard dim.cout() == 4 else {
+            return
+        }
+        
+        guard layout == DataLayout.NCHW() && to == DataLayout.NHWC() else {
+            // other not support
+            return
+        }
+        let newPointer = UnsafeMutablePointer<P>.allocate(capacity: data.size)
+        
+        if layout == DataLayout.NCHW() {
+            NCHW2NHWC(newPtr: newPointer)
+        }
+        
+        data.release()
+        data.pointer = newPointer
+        layout = to
     }
     
-    guard let floatPointer = data.pointer as? UnsafeMutablePointer<Float32> else {
-      fatalError(" not support yet ")
-    }
     
-    let precisionSize: Int
-    switch precision {
-    case .Float32:
-      precisionSize = 4
-    case .Float16:
-      precisionSize = 2
-    }
     
-    if dim.cout() == 4 {
-      if layout == DataLayout.NHWC() {
-        let C = dim[3]
-        let cSlices = (C + 3) / 4
-        let paddedC = cSlices * 4
-        let count = paddedC * dim[0] * dim[1] * dim[2]
-        if C == paddedC {
-          buffer = device.makeBuffer(length: count * precisionSize)
-          switch precision {
-          case .Float32:
-            buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
-          case .Float16:
-            float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
-          }
-        } else if C == 1 && !padWhenOneC {
-          buffer = device.makeBuffer(length: numel() * precisionSize)
-          switch precision {
-          case .Float32:
-            buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
-          case .Float16:
-            float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
-          }
-        } else {
-          buffer = device.makeBuffer(length: count * precisionSize)
-          let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
-          var tmpPointer = floatPointer
-          var dstPtr = convertedPointer
-          for _ in 0..<dim[0] * dim[1] * dim[2] {
-            for j in 0..<paddedC {
-              if j < C {
-                dstPtr[j] = tmpPointer[j]
-              } else {
-                dstPtr[j] = 0
-              }
+    func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16, padWhenOneC: Bool = false, convertToNHWC: Bool = true, withTranspose: Bool = false) {
+        if convertToNHWC {
+            //      print(layout)
+            convert(to: DataLayout.NHWC())
+        }
+        
+        if withTranspose {
+            let transposePointer = UnsafeMutablePointer<P>.allocate(capacity: numel())
+            let n = dim[0]
+            let hwc = numel()/n
+            for j in 0..<hwc {
+                for i in 0..<n {
+                    //data[i * hwc + j]
+                    transposePointer[j * n + i] = data[i * hwc + j]
+                }
             }
-            tmpPointer += C
-            dstPtr += paddedC
-          }
-          
-          switch precision {
-          case .Float32:
-            buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
-          case .Float16:
-            float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
-          }
-          
-          convertedPointer.deinitialize(count: count)
-          convertedPointer.deallocate()
+            
+            dim.swapeDimAt(index1: 0, index2: 3)
+            data.release()
+            data.pointer = transposePointer
         }
-      } else {
-        let C = dim[3]
-        let cSlices = (C + 3) / 4
-        let paddedC = cSlices * 4
-        let count = paddedC * dim[0] * dim[1] * dim[2]
-        if C == paddedC {
-          buffer = device.makeBuffer(length: count * precisionSize)
-          switch precision {
-          case .Float32:
-            buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
-          case .Float16:
-            float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
-          }
-        } else if C == 1 {
-          fatalError(" not support ")
-        } else {
-          buffer = device.makeBuffer(length: count * precisionSize)
-          let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
-          var tmpPointer = floatPointer
-          var dstPtr = convertedPointer
-          for _ in 0..<dim[0] * dim[1] * dim[2] {
-            for j in 0..<paddedC {
-              if j < C {
-                dstPtr[j] = tmpPointer[j]
-              } else {
-                dstPtr[j] = 0
-              }
+        
+        guard let floatPointer = data.pointer as? UnsafeMutablePointer<Float32> else {
+            fatalError(" not support yet ")
+        }
+        
+        let precisionSize: Int
+        switch precision {
+        case .Float32:
+            precisionSize = 4
+        case .Float16:
+            precisionSize = 2
+        }
+        
+        if dim.cout() == 4 {
+            if layout == DataLayout.NHWC() {
+                let C = dim[3]
+                let cSlices = (C + 3) / 4
+                let paddedC = cSlices * 4
+                let count = paddedC * dim[0] * dim[1] * dim[2]
+                if C == paddedC {
+                    buffer = device.makeBuffer(length: count * precisionSize)
+                    switch precision {
+                    case .Float32:
+                        buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
+                    case .Float16:
+                        float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
+                    }
+                } else if C == 1 && !padWhenOneC {
+                    buffer = device.makeBuffer(length: numel() * precisionSize)
+                    switch precision {
+                    case .Float32:
+                        buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
+                    case .Float16:
+                        float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
+                    }
+                } else {
+                    buffer = device.makeBuffer(length: count * precisionSize)
+                    let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
+                    var tmpPointer = floatPointer
+                    var dstPtr = convertedPointer
+                    for _ in 0..<dim[0] * dim[1] * dim[2] {
+                        for j in 0..<paddedC {
+                            if j < C {
+                                dstPtr[j] = tmpPointer[j]
+                            } else {
+                                dstPtr[j] = 0
+                            }
+                        }
+                        tmpPointer += C
+                        dstPtr += paddedC
+                    }
+                    
+                    switch precision {
+                    case .Float32:
+                        buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
+                    case .Float16:
+                        float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
+                    }
+                    
+                    convertedPointer.deinitialize(count: count)
+                    convertedPointer.deallocate()
+                }
+            } else {
+                let C = dim[3]
+                let cSlices = (C + 3) / 4
+                let paddedC = cSlices * 4
+                let count = paddedC * dim[0] * dim[1] * dim[2]
+                if C == paddedC {
+                    buffer = device.makeBuffer(length: count * precisionSize)
+                    switch precision {
+                    case .Float32:
+                        buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
+                    case .Float16:
+                        float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
+                    }
+                } else if C == 1 {
+                    fatalError(" not support ")
+                } else {
+                    buffer = device.makeBuffer(length: count * precisionSize)
+                    let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
+                    var tmpPointer = floatPointer
+                    var dstPtr = convertedPointer
+                    for _ in 0..<dim[0] * dim[1] * dim[2] {
+                        for j in 0..<paddedC {
+                            if j < C {
+                                dstPtr[j] = tmpPointer[j]
+                            } else {
+                                dstPtr[j] = 0
+                            }
+                        }
+                        tmpPointer += C
+                        dstPtr += paddedC
+                    }
+                    
+                    switch precision {
+                    case .Float32:
+                        buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
+                    case .Float16:
+                        float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
+                    }
+                    convertedPointer.deinitialize(count: count)
+                    convertedPointer.deallocate()
+                }
             }
-            tmpPointer += C
-            dstPtr += paddedC
-          }
-          
-          switch precision {
-          case .Float32:
-            buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
-          case .Float16:
-            float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
-          }
-          convertedPointer.deinitialize(count: count)
-          convertedPointer.deallocate()
+        } else if dim.cout() == 1 {
+            let num = ((numel() + 3) / 4) * 4
+            buffer = device.makeBuffer(length: num * precisionSize)
+            switch precision {
+            case .Float32:
+                buffer?.contents().copyMemory(from: data.pointer, byteCount: num * MemoryLayout<P>.stride)
+            case .Float16:
+                float32ToFloat16(input: floatPointer, output: buffer.contents(), count: num)
+            }
+        } else {
+            fatalError(" not support !")
         }
-      }
-    } else if dim.cout() == 1 {
-      let num = ((numel() + 3) / 4) * 4
-      buffer = device.makeBuffer(length: num * precisionSize)
-      switch precision {
-      case .Float32:
-        buffer?.contents().copyMemory(from: data.pointer, byteCount: num * MemoryLayout<P>.stride)
-      case .Float16:
-        float32ToFloat16(input: floatPointer, output: buffer.contents(), count: num)
-      }
-    } else {
-      fatalError(" not support !")
+        //TODO: release
+        data.release()
     }
-    //TODO: release
-    data.release()
-  }
-  
-  var n: Int {
-    get {
-      if dim.cout() == 4 {
-        if layout == DataLayout.NCHW() {
-          return dim[0]
-        } else if layout == DataLayout.NHWC() {
-          return dim[0]
-        } else {
-          fatalError(" unsupport ")
+    
+    /// Batch size of a rank-4 tensor: `dim[0]` for both NCHW and NHWC.
+    /// Traps for any other rank or layout.
+    var n: Int {
+        get {
+            // Only four-dimensional tensors are handled here.
+            guard dim.cout() == 4 else {
+                fatalError()
+            }
+            guard layout == DataLayout.NCHW() || layout == DataLayout.NHWC() else {
+                fatalError(" unsupport ")
+            }
+            // The batch axis leads in both supported layouts.
+            return dim[0]
         }
-      } else {
-        fatalError()
-      }
     }
-  }
-  
-  var width: Int {
-    get {
-      if dim.cout() == 4 {
-        if layout == DataLayout.NHWC() {
-          return dim[2]
-        } else if layout == DataLayout.NCHW() {
-          return dim[3]
-        } else {
-          fatalError(" unsupport ")
+    
+    /// Width of a rank-4 tensor: `dim[2]` for NHWC, `dim[3]` for NCHW.
+    /// Traps for any other rank or layout.
+    var width: Int {
+        get {
+            // Axis lookups below are only defined for four dimensions.
+            guard dim.cout() == 4 else {
+                fatalError()
+            }
+            // The layout decides which axis holds W.
+            if layout == DataLayout.NHWC() { return dim[2] }
+            if layout == DataLayout.NCHW() { return dim[3] }
+            // Neither supported layout matched.
+            fatalError(" unsupport ")
         }
-      } else {
-        fatalError()
-      }
     }
-  }
-  
-  var height: Int {
-    get {
-      if dim.cout() == 4 {
-        if layout == DataLayout.NHWC() {
-          return dim[1]
-        } else if layout == DataLayout.NCHW() {
-          return dim[2]
-        } else {
-          fatalError(" unsupport ")
+    
+    /// Height of a rank-4 tensor: `dim[1]` for NHWC, `dim[2]` for NCHW.
+    /// Traps for any other rank or layout.
+    var height: Int {
+        get {
+            // Axis lookups below are only defined for four dimensions.
+            guard dim.cout() == 4 else {
+                fatalError()
+            }
+            // The layout decides which axis holds H.
+            if layout == DataLayout.NHWC() { return dim[1] }
+            if layout == DataLayout.NCHW() { return dim[2] }
+            // Neither supported layout matched.
+            fatalError(" unsupport ")
         }
-      } else {
-        fatalError()
-      }
     }
-  }
-  
-  var channel: Int {
-    get {
-      if dim.cout() == 4 {
-        if layout == DataLayout.NHWC() {
-          return dim[3]
-        } else if layout == DataLayout.NCHW() {
-          return dim[1]
-        } else {
-          fatalError(" unsupport ")
+    
+    /// Channel count of a rank-4 tensor: `dim[3]` for NHWC, `dim[1]` for NCHW.
+    /// Traps for any other rank or layout.
+    var channel: Int {
+        get {
+            // Axis lookups below are only defined for four dimensions.
+            guard dim.cout() == 4 else {
+                fatalError()
+            }
+            // The layout decides which axis holds C.
+            if layout == DataLayout.NHWC() { return dim[3] }
+            if layout == DataLayout.NCHW() { return dim[1] }
+            // Neither supported layout matched.
+            fatalError(" unsupport ")
         }
-      } else {
-        fatalError()
-      }
     }
-  }
-  
-  
-  func NCHW2NHWC(newPtr: UnsafeMutablePointer<P>) {
-    let N = dim[0]
-    let C = dim[1]
-    let H = dim[2]
-    let W = dim[3]
-    let HXW = H * W
-    let CXHXW = C * H * W
     
-    var index: Int = 0
-    for n in 0..<N {
-      for h in 0..<H{
-        for w in 0..<W{
-          for c in 0..<C{
-            newPtr[index] = data.pointer[n * CXHXW + c * HXW + h * W + w]
-            index += 1
-          }
+    
+    /// Copies the NCHW-ordered values behind `data.pointer` into `newPtr` in
+    /// NHWC order; entries 1 and 3 of `dim` are swapped once the copy is done.
+    /// NOTE(review): that swap turns [N, C, H, W] into [N, W, H, C], which is
+    /// NHWC only when H == W — confirm whether callers depend on this.
+    func NCHW2NHWC(newPtr: UnsafeMutablePointer<P>) {
+        let (batch, channels, rows, cols) = (dim[0], dim[1], dim[2], dim[3])
+        let planeSize = rows * cols
+        let imageSize = channels * planeSize
+        var dst: Int = 0
+        for n in 0..<batch {
+            for h in 0..<rows {
+                for w in 0..<cols {
+                    for c in 0..<channels {
+                        newPtr[dst] = data.pointer[n * imageSize + c * planeSize + h * cols + w]
+                        dst += 1
+                    }
+                }
+            }
         }
-      }
+        dim.swapeDimAt(index1: 1, index2: 3)
     }
-    dim.swapeDimAt(index1: 1, index2: 3)
-  }
 }
 
 extension Tensor {
-  
-  var debugDescription: String {
-    var str = "dim: \(dim) \n"
-    str += "MTLBuffer: \(self.buffer) \n"
-    for i in 0..<buffer.length/MemoryLayout<P>.size {
-      str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])"
+    
+    /// Text dump: `dim`, the buffer itself, then the buffer contents read as `P`.
+    var debugDescription: String {
+        let header = "dim: \(dim) \n" + "MTLBuffer: \(self.buffer) \n"
+        let elementCount = buffer.length/MemoryLayout<P>.size
+        return (0..<elementCount).reduce(into: header) { text, index in
+            text += " \(buffer.contents().assumingMemoryBound(to: P.self)[index])"
+        }
+    }
+    
+    /// Prints `header`, then a single string holding `data.size`, `dim` and
+    /// the first `numel()` values read from `data.pointer`.
+    func logDataPointer(header: String = "") {
+        print(header)
+        var dump = "data size: \(data.size) \n" + "dim: \(dim) \n"
+        for offset in 0..<numel() {
+            dump += " \(data.pointer[offset])"
+        }
+        print(dump)
     }
-    return str
-  }
-  
-  func logDataPointer(header: String = "") {
-    print(header)
-    var str = ""
-    str += "data size: \(data.size) \n"
-    str += "dim: \(dim) \n"
-    for i in 0..<numel() {
-      str += " \(data.pointer[i])"
+    
+    var description: String { // same text as debugDescription
+        return debugDescription
     }
-    print(str)
-  }
-  
-  var description: String {
-    return debugDescription
-  }
-  
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift
index 14631464d8..cc1ed05e12 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift
@@ -16,26 +16,26 @@ import Metal
 import Foundation
 
 class InputTexture {
-  let mtlTexture: MTLTexture
-  let expectDim: Dim
-  init(inMTLTexture: MTLTexture, inExpectDim: Dim) {
-    mtlTexture = inMTLTexture
-    expectDim = inExpectDim
-  }
+    let mtlTexture: MTLTexture // texture handed in at construction
+    let expectDim: Dim // dimension value stored alongside the texture
+    init(inMTLTexture: MTLTexture, inExpectDim: Dim) { // stores both arguments as given
+        mtlTexture = inMTLTexture
+        expectDim = inExpectDim
+    }
 }
 
 extension InputTexture {
-  var description: String {
-    get{
-      return mtlTexture.description
+    /// Forwards to the wrapped texture's `description`.
+    var description: String {
+        // Read-only property, written without an explicit `get` block.
+        return mtlTexture.description
     }
-  }
-  
-  var debugDescription: String {
-    get {
-      return mtlTexture.debugDescription ?? " MetalTexture "
+    
+    /// The texture's `debugDescription`, or " MetalTexture " when that is nil.
+    var debugDescription: String {
+        let fallback = " MetalTexture "
+        return mtlTexture.debugDescription ?? fallback
     }
-  }
 }
 
 
@@ -46,17 +46,17 @@ extension InputTexture {
  .height = b
  .len = a * d + 3 / 4
  
-低于 4 维的 tensor，transpose 必须为 [0, 1, 2, 3] 既不考虑 transpose
+ For tensors with fewer than 4 dimensions, transpose must be [0, 1, 2, 3], i.e. transpose is not considered
  
-// TODO transpose 对于低维 tensor 的扩展原则。。。
-// [a, b] -> [1, 1, a, b] transpose 必须为 [0, 1, x, x]
-// [a] -> [1, 1, 1, a] transpose 必须为 [0, 1, 2, 3]
-// [a, b, c] -> [1, a, b, c] tranpose 必须为 [0, x, x, x]
-
-3 维 tensor [a, b, c] 对应的 texture_2darray,
-.width = c
-.height = b
-.len = a + 3 / 4
+ // TODO: rules for extending transpose to lower-rank tensors...
+ // [a, b] -> [1, 1, a, b]; transpose must be [0, 1, x, x]
+ // [a] -> [1, 1, 1, a]; transpose must be [0, 1, 2, 3]
+ // [a, b, c] -> [1, a, b, c]; transpose must be [0, x, x, x]
+ 
+ A 3-D tensor [a, b, c] maps to a texture_2darray with
+ .width = c
+ .height = b
+ .len = (a + 3) / 4
  
  2 维 tensor [a, b] 对应的 texture_2darray
  .width = b + 3 / 4
@@ -69,136 +69,136 @@ extension InputTexture {
  .len = 1
  */
 public class Texture: Tensorial {
-  public var dim: Dim
-  public var tensorDim: Dim
-  
-  /// tensor dim pad to four
-  public var padToFourDim: Dim
-  private var textureDesc: MTLTextureDescriptor!
-  public var metalTexture: MTLTexture!
-  var transpose: [Int] = [0, 1, 2, 3]
-  
-  func elementCount() -> Int {
-    return metalTexture.width * metalTexture.height * metalTexture.arrayLength * 4
-  }
-  
-  func toTensor() -> [Float32] {
-    guard  padToFourDim.cout() == 4 else {
-      fatalError("- not support -")
-    }
-    return metalTexture.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
-  }
-  
-  func realNHWC() -> [Float32] {
-    guard padToFourDim.cout() == 4 else {
-      fatalError(" - not support - ")
-    }
-    return metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
-  }
-  
-  public func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) {
-    transpose = inTranspose
-    for i in 0..<(4 - tensorDim.cout()) {
-      if i != inTranspose[i] {
-        fatalError()
-      }
-    }
+    public var dim: Dim // four-entry dims; initTexture replaces them with the transposed order
+    public var tensorDim: Dim // dims as passed to init/updateDims, without padding
     
-    let newDim = transpose.map { padToFourDim[$0] }
-    let newLayout = transpose.map { layout.layoutWithDim[$0] }
+    /// Tensor dims left-padded with 1s to rank four (see init and updateDims).
+    public var padToFourDim: Dim
+    private var textureDesc: MTLTextureDescriptor! // descriptor built by initTexture
+    public var metalTexture: MTLTexture! // nil until initTexture creates it
+    var transpose: [Int] = [0, 1, 2, 3] // axis order last passed to initTexture
     
-    layout = DataLayout.init(newLayout)
-    dim = Dim.init(inDim: newDim)
+    func elementCount() -> Int { // texel count times 4, the rgba component count
+        return metalTexture.width * metalTexture.height * metalTexture.arrayLength * 4
+    }
     
-    let tmpTextureDes = MTLTextureDescriptor.init()
-    tmpTextureDes.textureType = .type2DArray
-    tmpTextureDes.depth = 1
+    /// Calls `metalTexture.toTensor` with n = dim[0], h = dim[1], w = dim[2], c = dim[3].
+    func toTensor() -> [Float32] {
+        guard padToFourDim.cout() == 4 else { fatalError("- not support -") }
+        let shape = (n: dim[0], c: dim[3], h: dim[1], w: dim[2])
+        return metalTexture.toTensor(dim: shape)
+    }
     
-    switch tensorDim.cout() {
-    case 4:
-      tmpTextureDes.width = newDim[2]
-      tmpTextureDes.height = newDim[1]
-      tmpTextureDes.arrayLength = ((newDim[0]) * (newDim[3]) + 3) / 4
-    case 3:
-      tmpTextureDes.width = newDim[3]
-      tmpTextureDes.height = newDim[2]
-      tmpTextureDes.arrayLength = (newDim[1] + 3) / 4
-    case 2, 1:
-      tmpTextureDes.width = (newDim[3] + 3) / 4
-      tmpTextureDes.height = newDim[2]
-      tmpTextureDes.arrayLength = 1
-    default:
-      fatalError("unreachable")
+    /// Calls `metalTexture.realNHWC` with `padToFourDim` entries 0...3 labelled n, h, w, c.
+    func realNHWC() -> [Float32] {
+        guard padToFourDim.cout() == 4 else { fatalError(" - not support - ") }
+        let shape = (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])
+        return metalTexture.realNHWC(dim: shape)
     }
-   
-    if computePrecision == .Float16 {
-      tmpTextureDes.pixelFormat = .rgba16Float
-    } else if computePrecision == .Float32 {
-      tmpTextureDes.pixelFormat = .rgba32Float
+    
+    /// Applies `inTranspose` to `padToFourDim` and `layout`, then creates a shared,
+    /// shader read/write 2D-array texture sized from the transposed dims.
+    /// Traps if an axis added by padding is moved; `?!` (defined elsewhere)
+    /// presumably traps with " texture nil " when no texture is returned.
+    public func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) {
+        transpose = inTranspose
+        // Axes introduced by padding to rank four must keep their position.
+        for axis in 0..<(4 - tensorDim.cout()) where axis != inTranspose[axis] {
+            fatalError()
+        }
+        let permutedDims = transpose.map { padToFourDim[$0] }
+        let permutedLayout = transpose.map { layout.layoutWithDim[$0] }
+        layout = DataLayout.init(permutedLayout)
+        dim = Dim.init(inDim: permutedDims)
+        
+        let descriptor = MTLTextureDescriptor.init()
+        descriptor.textureType = .type2DArray
+        descriptor.depth = 1
+        // Round-up division by 4: presumably four values share one rgba texel.
+        switch tensorDim.cout() {
+        case 4:
+            descriptor.width = permutedDims[2]
+            descriptor.height = permutedDims[1]
+            descriptor.arrayLength = ((permutedDims[0]) * (permutedDims[3]) + 3) / 4
+        case 3:
+            descriptor.width = permutedDims[3]
+            descriptor.height = permutedDims[2]
+            descriptor.arrayLength = (permutedDims[1] + 3) / 4
+        case 2, 1:
+            descriptor.width = (permutedDims[3] + 3) / 4
+            descriptor.height = permutedDims[2]
+            descriptor.arrayLength = 1
+        default:
+            fatalError("unreachable")
+        }
+        
+        if computePrecision == .Float16 {
+            descriptor.pixelFormat = .rgba16Float
+        } else if computePrecision == .Float32 {
+            descriptor.pixelFormat = .rgba32Float
+        }
+        descriptor.usage = [.shaderRead, .shaderWrite]
+        descriptor.storageMode = .shared
+        textureDesc = descriptor
+        metalTexture = device.makeTexture(descriptor: descriptor) ?! " texture nil "
     }
     
-    tmpTextureDes.usage = [.shaderRead, .shaderWrite]
-    tmpTextureDes.storageMode = .shared
-    textureDesc = tmpTextureDes
-    metalTexture = device.makeTexture(descriptor: tmpTextureDes) ?! " texture nil "
-  }
-  
-  public func updateDims(inTensorDim: Dim, inDim: Dim) {
-    var fourDim: Dim
-    if inDim.cout() == 4 {
-      fourDim = inDim
-    } else if inDim.cout() < 4 {
-      var fourDimNum: [Int] = []
-      for _ in 0..<(4 - inDim.cout()) {
-        fourDimNum.append(1)
-      }
-      fourDimNum.append(contentsOf: inDim.dims)
-      fourDim = Dim.init(inDim: fourDimNum)
-    } else {
-      fatalError(" not support ")
+    /// Replaces `tensorDim` with `inTensorDim` and sets both `dim` and
+    /// `padToFourDim` to `inDim` left-padded with 1s up to rank four.
+    /// Traps when `inDim` has more than four dimensions.
+    public func updateDims(inTensorDim: Dim, inDim: Dim) {
+        let rank = inDim.cout()
+        guard rank <= 4 else {
+            fatalError(" not support ")
+        }
+        let padded: Dim
+        if rank == 4 {
+            padded = inDim
+        } else {
+            // Prepend one unit axis per missing dimension.
+            padded = Dim.init(inDim: [Int](repeating: 1, count: 4 - rank) + inDim.dims)
+        }
+        tensorDim = inTensorDim
+        dim = padded
+        padToFourDim = padded
     }
     
-    tensorDim = inTensorDim
-    dim = fourDim
-    padToFourDim = fourDim
-  }
-  
-  // 初始化时 dim padToFourDim 模型中的维度（一般来说 nchw），前面补全0
-  init(device: MTLDevice, inDim: Dim) {
-    print(" in dim > \(inDim)")
-    var fourDim: Dim
-    if inDim.cout() == 4 {
-      fourDim = inDim
-    } else if inDim.cout() < 4 {
-      var fourDimNum: [Int] = []
-      for _ in 0..<(4 - inDim.cout()) {
-        fourDimNum.append(1)
-      }
-      fourDimNum.append(contentsOf: inDim.dims)
-      fourDim = Dim.init(inDim: fourDimNum)
-    } else {
-      fatalError(" not support ")
+    /// Starts from the model's dimensions (usually NCHW): `tensorDim` keeps
+    /// `inDim` as given, while `dim` and `padToFourDim` get it left-padded
+    /// with 1s to rank four; `layout` labels those four axes N, C, H, W.
+    /// Traps when `inDim` has more than four dimensions.
+    init(device: MTLDevice, inDim: Dim) {
+        print(" in dim > \(inDim)")
+        let rank = inDim.cout()
+        guard rank <= 4 else {
+            fatalError(" not support ")
+        }
+        let padded: Dim
+        if rank == 4 {
+            padded = inDim
+        } else {
+            padded = Dim.init(inDim: [Int](repeating: 1, count: 4 - rank) + inDim.dims)
+        }
+        tensorDim = inDim
+        dim = padded
+        padToFourDim = padded
+        layout = DataLayout.init([(.N, padded[0]), (.C, padded[1]), (.H, padded[2]), (.W, padded[3])])
     }
-    tensorDim = inDim
-    dim = fourDim
-    padToFourDim = fourDim
-    layout = DataLayout.init([(.N, fourDim[0]), (.C, fourDim[1]), (.H, fourDim[2]), (.W, fourDim[3])])
-  }
-  
-  private(set) var layout: DataLayout
+    
+    private(set) var layout: DataLayout // N, C, H, W after init; permuted by initTexture
 }
 
 extension Texture {
-  public var description: String {
-    return debugDescription
-  }
-  
-  public var debugDescription: String{
-    var str = ""
-    str += "Dim: \(dim) \n value:[ "
-    str += "\(metalTexture)"
-    str += " ]"
-    return str
-  }
-  
+    /// Same text as `debugDescription`.
+    public var description: String {
+        return debugDescription
+    }
+    
+    /// `dim`, followed by the interpolated `metalTexture` wrapped in "value:[ … ]".
+    public var debugDescription: String {
+        let head = "Dim: \(dim) \n value:[ "
+        let body = "\(metalTexture)"
+        return head + body + " ]"
+    }
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift
index fcedbd36f7..f16344e500 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift
@@ -27,7 +27,7 @@ class OpCreator<P: PrecisionType> {
         }
     }
     
-  func creat(device: MTLDevice, opDesc: PMOpDesc, scope: Scope, initContext: InitContext) throws -> Runable & InferShaperable {
+    func creat(device: MTLDevice, opDesc: PMOpDesc, scope: Scope, initContext: InitContext) throws -> Runable & InferShaperable {
         guard let opCreator = opCreators[opDesc.type] else {
             throw PaddleMobileError.opError(message: "there is no " + opDesc.type + " yet")
         }
@@ -69,6 +69,6 @@ class OpCreator<P: PrecisionType> {
          gConvAddAddPreluType       :     ConvAddAddPreluOp<P>.creat,
          gElementwiseAddPreluType   :     ElementwiseAddPreluOp<P>.creat,
          gFusionConvAddType         :     ConvAddOp<P>.creat]
-  
+    
     private init(){}
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpParam.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpParam.swift
index 01c2216664..0af90e411b 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpParam.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpParam.swift
@@ -22,199 +22,199 @@ import Foundation
  */
 
 protocol OpParam {
-  associatedtype OutputType: Variant
-  var output: OutputType { get set }
-  func outputDesc() -> String
-  
-  //associatedtype ParamPrecisionType: PrecisionType
-  init(opDesc: PMOpDesc, inScope: Scope) throws
-  static func getFirstTensor<VarType: Variant>(key: String, map: [String : [String]], from: Scope) throws -> VarType
-  static func inputX<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
-  static func inputBiase<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
-  static func inputMean<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
-  static func inputScale<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
-  static func inputVariance<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
-  static func inputFilter<VarType: Variant>(paraInputs: [String : [String]], from: Scope) throws -> VarType
-  static func input<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
-  static func output<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
-  static func outputY<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
-  static func inputY<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
-  
-  static func inputImage<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
-  
-  static func outputBoxes<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
-  
-  static func outputOut<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
-  
-  static func outputVariances<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
-  
-  static func getAttr<T>(key: String, attrs: [String : Attr]) throws -> T
-  
-  static func paramInputAlpha<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
-  
+    associatedtype OutputType: Variant
+    var output: OutputType { get set }
+    func outputDesc() -> String
+    
+    //associatedtype ParamPrecisionType: PrecisionType
+    init(opDesc: PMOpDesc, inScope: Scope) throws
+    static func getFirstTensor<VarType: Variant>(key: String, map: [String : [String]], from: Scope) throws -> VarType
+    static func inputX<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+    static func inputBiase<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+    static func inputMean<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+    static func inputScale<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+    static func inputVariance<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+    static func inputFilter<VarType: Variant>(paraInputs: [String : [String]], from: Scope) throws -> VarType
+    static func input<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+    static func output<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
+    static func outputY<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
+    static func inputY<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+    /// The default implementation looks up the "Image" key of `inputs`.
+    static func inputImage<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+    /// The default implementation looks up the "Boxes" key of `outputs`.
+    static func outputBoxes<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
+    /// The default implementation looks up the "Out" key of `outputs`.
+    static func outputOut<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
+    /// The default implementation looks up the "Variances" key of `outputs`.
+    static func outputVariances<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
+    /// The default implementation returns `attrs[key]` cast to `T`, throwing when missing or mistyped.
+    static func getAttr<T>(key: String, attrs: [String : Attr]) throws -> T
+    /// The default implementation looks up the "Alpha" key of `inputs`.
+    static func paramInputAlpha<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+    
 }
 
 extension OpParam {
-  func outputDesc() -> String {
-    return output.debugDescription
-  }
-  
-  static func getFirstTensor<VarType: Variant>(key: String, map: [String : [String]], from: Scope) throws -> VarType {
-    guard let mapKeys = map[key], mapKeys.count > 0 else {
-      throw PaddleMobileError.paramError(message: key + " not found in \(map) or maped values is empty")
+    func outputDesc() -> String { // debug text of this parameter set's output
+        return output.debugDescription
+    }
+    
+    /// Returns the scope variable named by the first entry of `map[key]`, cast to `VarType`.
+    /// Throws `paramError` for a missing/empty entry, an unknown name, or a failed cast.
+    static func getFirstTensor<VarType: Variant>(key: String, map: [String : [String]], from: Scope) throws -> VarType {
+        guard let firstName = map[key]?.first else {
+            throw PaddleMobileError.paramError(message: key + " not found in \(map) or maped values is empty")
+        }
+        guard let stored = from[firstName] else {
+            throw PaddleMobileError.paramError(message: firstName + " not found in scope")
+        }
+        guard let typed = stored as? VarType else {
+            throw PaddleMobileError.paramError(message: " type error")
+        }
+        return typed
     }
-    guard let variant = from[mapKeys[0]] else {
-      throw PaddleMobileError.paramError(message: mapKeys[0] + " not found in scope")
+    
+    /// Resolves the "Variances" entry of `outputs` to a scope variable of type `VarType`.
+    ///
+    /// Only the first name listed under the key is used.
+    /// - Throws: whatever `getFirstTensor` throws, unchanged.
+    static func outputVariances<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
+        // A bare `try` forwards errors exactly like catch-and-rethrow.
+        return try getFirstTensor(key: "Variances", map: outputs, from: from)
     }
     
-    guard let v = variant as? VarType else {
-      throw PaddleMobileError.paramError(message: " type error")
-
+    /// Resolves the "Alpha" entry of `inputs` to a scope variable of type `VarType`.
+    ///
+    /// Only the first name listed under the key is used.
+    /// - Throws: whatever `getFirstTensor` throws, unchanged.
+    static func paramInputAlpha<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+        // A bare `try` forwards errors exactly like catch-and-rethrow.
+        return try getFirstTensor(key: "Alpha", map: inputs, from: from)
+    }
+    
+    
+    /// Resolves the "Image" entry of `inputs` to a scope variable of type `VarType`.
+    ///
+    /// Only the first name listed under the key is used.
+    /// - Throws: whatever `getFirstTensor` throws, unchanged.
+    static func inputImage<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+        // A bare `try` forwards errors exactly like catch-and-rethrow.
+        return try getFirstTensor(key: "Image", map: inputs, from: from)
+    }
+    
+    /// Resolves the "X" entry of `inputs` to a scope variable of type `VarType`.
+    ///
+    /// Only the first name listed under the key is used.
+    /// - Throws: whatever `getFirstTensor` throws, unchanged.
+    static func inputX<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+        // A bare `try` forwards errors exactly like catch-and-rethrow.
+        return try getFirstTensor(key: "X", map: inputs, from: from)
+    }
+    
+    /// Resolves the "Boxes" entry of `outputs` to a scope variable of type `VarType`.
+    ///
+    /// Only the first name listed under the key is used.
+    /// - Throws: whatever `getFirstTensor` throws, unchanged.
+    static func outputBoxes<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
+        // A bare `try` forwards errors exactly like catch-and-rethrow.
+        return try getFirstTensor(key: "Boxes", map: outputs, from: from)
+    }
+    
+    /// Resolves the "Input" entry of `inputs` to a scope variable of type `VarType`.
+    ///
+    /// Only the first name listed under the key is used.
+    /// - Throws: whatever `getFirstTensor` throws, unchanged.
+    static func input<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+        // A bare `try` forwards errors exactly like catch-and-rethrow.
+        return try getFirstTensor(key: "Input", map: inputs, from: from)
+    }
+    
+    /// Resolves the "Output" entry of `outputs` to a scope variable of type `VarType`.
+    ///
+    /// Only the first name listed under the key is used.
+    /// - Throws: whatever `getFirstTensor` throws, unchanged.
+    static func output<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
+        // A bare `try` forwards errors exactly like catch-and-rethrow.
+        return try getFirstTensor(key: "Output", map: outputs, from: from)
+    }
+    /// Resolves the "Y" entry of `outputs` to a scope variable of type `VarType`.
+    ///
+    /// Only the first name listed under the key is used.
+    /// - Throws: whatever `getFirstTensor` throws, unchanged.
+    static func outputY<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
+        // A bare `try` forwards errors exactly like catch-and-rethrow.
+        return try getFirstTensor(key: "Y", map: outputs, from: from)
+    }
+    /// Resolves the "Y" entry of `inputs` to a scope variable of type `VarType`.
+    ///
+    /// Only the first name listed under the key is used.
+    /// - Throws: whatever `getFirstTensor` throws, unchanged.
+    static func inputY<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+        // A bare `try` forwards errors exactly like catch-and-rethrow.
+        return try getFirstTensor(key: "Y", map: inputs, from: from)
+    }
+    
+    /// Resolves the "Out" entry of `outputs` to a scope variable of type `VarType`.
+    ///
+    /// Only the first name listed under the key is used.
+    /// - Throws: whatever `getFirstTensor` throws, unchanged.
+    static func outputOut<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
+        // A bare `try` forwards errors exactly like catch-and-rethrow.
+        return try getFirstTensor(key: "Out", map: outputs, from: from)
+    }
+    /// Resolves the "Filter" entry of `paraInputs` to a scope variable of type `VarType`.
+    ///
+    /// Only the first name listed under the key is used.
+    /// - Throws: whatever `getFirstTensor` throws, unchanged.
+    static func inputFilter<VarType: Variant>(paraInputs: [String : [String]], from: Scope) throws -> VarType {
+        // A bare `try` forwards errors exactly like catch-and-rethrow.
+        return try getFirstTensor(key: "Filter", map: paraInputs, from: from)
+    }
+    
+    /// Resolves the "Bias" entry of `inputs` to a scope variable of type `VarType`.
+    ///
+    /// Only the first name listed under the key is used.
+    /// - Throws: whatever `getFirstTensor` throws, unchanged.
+    static func inputBiase<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+        // A bare `try` forwards errors exactly like catch-and-rethrow.
+        return try getFirstTensor(key: "Bias", map: inputs, from: from)
+    }
+    
+    /// Resolves the "Mean" entry of `inputs` to a scope variable of type `VarType`.
+    ///
+    /// Only the first name listed under the key is used.
+    /// - Throws: whatever `getFirstTensor` throws, unchanged.
+    static func inputMean<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+        // A bare `try` forwards errors exactly like catch-and-rethrow.
+        return try getFirstTensor(key: "Mean", map: inputs, from: from)
+    }
+    
+    /// Resolves the "Scale" entry of `inputs` to a scope variable of type `VarType`.
+    ///
+    /// Only the first name listed under the key is used.
+    /// - Throws: whatever `getFirstTensor` throws, unchanged.
+    static func inputScale<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+        // A bare `try` forwards errors exactly like catch-and-rethrow.
+        return try getFirstTensor(key: "Scale", map: inputs, from: from)
+    }
+    
+    /// Resolves the "Variance" entry of `inputs` to a scope variable of type `VarType`.
+    ///
+    /// Only the first name listed under the key is used.
+    /// - Throws: whatever `getFirstTensor` throws, unchanged.
+    static func inputVariance<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+        // A bare `try` forwards errors exactly like catch-and-rethrow.
+        return try getFirstTensor(key: "Variance", map: inputs, from: from)
+    }
+    
+    /// Looks up `key` in `attrs` and casts it to `T`; throws `paramError` if the key is absent or the cast fails.
+    static func getAttr<T>(key: String, attrs: [String : Attr]) throws -> T{
+        guard let rawValue = attrs[key] else {
+            throw PaddleMobileError.paramError(message: "attr \(key) can't found in: \(attrs)" )
+        }
+        guard let typedValue = rawValue as? T else {
+            throw PaddleMobileError.paramError(message: "key: \(key) attr: \(rawValue) type error" )
+        }
+        return typedValue
     }
-    return v
-  }
-  
-  static func outputVariances<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorVariances: VarType = try getFirstTensor(key: "Variances", map: outputs, from: from)
-      return tensorVariances
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func paramInputAlpha<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let alphaTensor: VarType = try getFirstTensor(key: "Alpha", map: inputs, from: from)
-      return alphaTensor
-    } catch let error {
-      throw error
-    }
-  }
-  
-  
-  static func inputImage<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorImage: VarType = try getFirstTensor(key: "Image", map: inputs, from: from)
-      return tensorImage
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func inputX<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorX: VarType = try getFirstTensor(key: "X", map: inputs, from: from)
-      return tensorX
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func outputBoxes<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorBox: VarType = try getFirstTensor(key: "Boxes", map: outputs, from: from)
-      return tensorBox
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func input<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorInput: VarType = try getFirstTensor(key: "Input", map: inputs, from: from)
-      return tensorInput
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func output<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorOutput: VarType = try getFirstTensor(key: "Output", map: outputs, from: from)
-      return tensorOutput
-    } catch let error {
-      throw error
-    }
-  }
-  static func outputY<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorOutputY: VarType = try getFirstTensor(key: "Y", map: outputs, from: from)
-      return tensorOutputY
-    } catch let error {
-      throw error
-    }
-  }
-  static func inputY<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorY: VarType = try getFirstTensor(key: "Y", map: inputs, from: from)
-      return tensorY
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func outputOut<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let out: VarType = try getFirstTensor(key: "Out", map: outputs, from: from)
-      return out
-    } catch let error {
-      throw error
-    }
-  }
-  static func inputFilter<VarType: Variant>(paraInputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorFilter: VarType = try getFirstTensor(key: "Filter", map: paraInputs, from: from)
-      return tensorFilter
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func inputBiase<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorBias: VarType = try getFirstTensor(key: "Bias", map: inputs, from: from)
-      return tensorBias
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func inputMean<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorMean: VarType = try getFirstTensor(key: "Mean", map: inputs, from: from)
-      return tensorMean
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func inputScale<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorScale: VarType = try getFirstTensor(key: "Scale", map: inputs, from: from)
-      return tensorScale
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func inputVariance<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorVariance: VarType = try getFirstTensor(key: "Variance", map: inputs, from: from)
-      return tensorVariance
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func getAttr<T>(key: String, attrs: [String : Attr]) throws -> T{
-    guard let attr = attrs[key] else {
-      throw PaddleMobileError.paramError(message: "attr \(key) can't found in: \(attrs)" )
-    }
-    
-    guard let tAttr = attr as? T else {
-      throw PaddleMobileError.paramError(message: "key: \(key) attr: \(attr) type error" )
-    }
-    return tAttr
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift
index 532d1b661d..df7a765d2d 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift
@@ -16,129 +16,129 @@ import Metal
 import Foundation
 
 protocol Fusion {
-  static func fusionNode() -> Node
-  static func change() -> [String : [(from: String, to: String)]]
-  static func fusionType() -> String
-  static func needCheck() -> [(Int, String)]
+    static func fusionNode() -> Node
+    static func change() -> [String : [(from: String, to: String)]]
+    static func fusionType() -> String
+    static func needCheck() -> [(Int, String)]
 }
 extension Fusion {
-  static func needCheck() -> [(Int, String)] {
-    return []
-  }
+    static func needCheck() -> [(Int, String)] {
+        return []
+    }
 }
 
 protocol Runable {
-  func run(device: MTLDevice, buffer: MTLCommandBuffer) throws
-  func runImpl(device: MTLDevice,buffer: MTLCommandBuffer) throws
-  func delogOutput()
-  func inputVariant() -> [String : [MTLBuffer]]
-  func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer)
+    func run(device: MTLDevice, buffer: MTLCommandBuffer) throws
+    func runImpl(device: MTLDevice,buffer: MTLCommandBuffer) throws
+    func delogOutput()
+    func inputVariant() -> [String : [MTLBuffer]]
+    func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer)
 }
 
 extension Runable where Self: OperatorProtocol{
-  func run(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try runImpl(device: device, buffer: buffer)
-    } catch let error {
-      throw error
+    /// Default `run`: forwards to `runImpl` with the same device and command buffer.
+    /// - Parameter device: Metal device passed straight through to `runImpl`.
+    /// - Parameter buffer: command buffer passed straight through to `runImpl`.
+    /// - Throws: whatever `runImpl` throws; the error is not wrapped or altered.
+    func run(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try runImpl(device: device, buffer: buffer)
+    }
+    
+    func inputVariant() -> [String : [MTLBuffer]] {
+        // Default implementation always traps; an empty-dictionary fallback (`return [:]`) was left disabled.
+        fatalError(" op \(type) need implement inputVariant")
+    }
+    
+    func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) {
+        fatalError(" need implement ")
     }
-  }
-  
-  func inputVariant() -> [String : [MTLBuffer]] {
-//    return [:]
-    fatalError(" op \(type) need implement inputVariant")
-  }
-  
-  func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) {
-    fatalError(" need implement ")
-  }
-  
-  func delogOutput() {
     
-    print(type + ": has no implementation" )
-  }
+    func delogOutput() {
+        // Default implementation: only prints that this op type has no debug output of its own.
+        print(type + ": has no implementation" )
+    }
 }
 
 public class InitContext {
-  /// metal 代码加载方式
-  var metalLoadMode: MetalLoadMode = .LoadMetalInDefaultLib
-  /// 当 metalLoadMode 为 LoadMetalInCustomMetalLib 时， metal library 路径不能为空
-  var metalLibPath: String? = nil
-  init() {
-    metalLoadMode = .LoadMetalInDefaultLib
-    metalLibPath = nil
-  }
+    /// How the Metal code is loaded.
+    var metalLoadMode: MetalLoadMode = .LoadMetalInDefaultLib
+    /// When metalLoadMode is LoadMetalInCustomMetalLib, the metal library path must not be nil.
+    var metalLibPath: String? = nil
+    init() {
+        metalLoadMode = .LoadMetalInDefaultLib
+        metalLibPath = nil
+    }
 }
 
 protocol Creator where Self: OperatorProtocol{
-  associatedtype OpType: OperatorProtocol & Runable & InferShaperable
-  static func creat(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> OpType
+    associatedtype OpType: OperatorProtocol & Runable & InferShaperable
+    static func creat(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> OpType
 }
 
 extension Creator where Self: OperatorProtocol {
-  static func creat(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> OpType {
-    do {
-      return try OpType.provide(device:device, opDesc: opDesc, inScope: inScope, initContext: initContext)
-    } catch let error {
-      throw error
+    /// Default factory: builds the operator by calling `OpType.provide` with the same arguments.
+    /// - Parameters: `device`, `opDesc`, `inScope` and `initContext` are forwarded untouched.
+    /// - Returns: the `OpType` instance produced by `OpType.provide`.
+    /// - Throws: any error thrown by `OpType.provide`, propagated as-is.
+    static func creat(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> OpType {
+        return try OpType.provide(device:device, opDesc: opDesc, inScope: inScope, initContext: initContext)
     }
-  }
 }
 
 protocol InferShaperable {
-  func inferShape()
+    func inferShape()
 }
 
 protocol OperatorProtocol {
-  associatedtype ParamType
-  associatedtype KerType:  Computable where Self.KerType.ParamType == ParamType
-  var type: String { get }
-  var scope: Scope { get }
-  var inputs: [String : [String]] { get }
-  var paraInputs: [String : [String]] { get set }
-  var outpus: [String : [String]] { get }
-  var attrs: [String : Attr] { get }
-  var para: ParamType { get }
-  var kernel: KerType { get }
-  init(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws
+    associatedtype ParamType
+    associatedtype KerType:  Computable where Self.KerType.ParamType == ParamType
+    var type: String { get }
+    var scope: Scope { get }
+    var inputs: [String : [String]] { get }
+    var paraInputs: [String : [String]] { get set }
+    var outpus: [String : [String]] { get }
+    var attrs: [String : Attr] { get }
+    var para: ParamType { get }
+    var kernel: KerType { get }
+    init(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws
 }
 
 extension OperatorProtocol {
-  static func provide(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> Self {
-    do {
-      return try Self.init(device: device, opDesc: opDesc, inScope: inScope, initContext: initContext)
-    } catch let error {
-      throw error
+    /// Constructs `Self` through the protocol's required initializer.
+    /// - Parameters: all four arguments are handed to `Self.init` unchanged.
+    /// - Returns: the newly initialized operator.
+    /// - Throws: any error thrown by `Self.init`, propagated as-is.
+    static func provide(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> Self {
+        return try Self.init(device: device, opDesc: opDesc, inScope: inScope, initContext: initContext)
     }
-  }
 }
 
 class Operator <KernelType:  Computable , ParameterType>: OperatorProtocol where KernelType.ParamType == ParameterType {
-  required init(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws {
-    type = opDesc.type
-    scope = inScope
-    inputs = opDesc.inputs
-    outpus = opDesc.outputs
-    attrs =  opDesc.attrs
-    paraInputs = opDesc.paraInputs
-    do {
-      para = try ParamType.init(opDesc:opDesc, inScope: inScope)
-    } catch let error {
-      throw error
+    /// Copies the op description into this operator, then builds its parameter object and kernel.
+    /// - Throws: any error thrown while constructing `ParamType` from `opDesc` and `inScope`.
+    required init(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws {
+        // Descriptor fields are copied first; the parameter object is built afterwards.
+        type = opDesc.type
+        scope = inScope
+        inputs = opDesc.inputs
+        outpus = opDesc.outputs
+        attrs =  opDesc.attrs
+        paraInputs = opDesc.paraInputs
+        para = try ParamType.init(opDesc:opDesc, inScope: inScope)
+        // The kernel is created last because its initializer takes the freshly built `para`.
+        kernel = KernelType.init(device: device, param: para, initContext: initContext)
     }
-    kernel = KernelType.init(device: device, param: para, initContext: initContext)
-  }
-  
-  typealias ParamType = ParameterType
-  typealias KerType = KernelType
-  let type: String
-  let inputs: [String : [String]]
-  var paraInputs: [String : [String]]
-  let outpus: [String : [String]]
-  let attrs: [String : Attr]
-  let para: ParamType
-  let scope: Scope
-  var kernel: KerType
+    
+    typealias ParamType = ParameterType
+    typealias KerType = KernelType
+    let type: String
+    let inputs: [String : [String]]
+    var paraInputs: [String : [String]]
+    let outpus: [String : [String]]
+    let attrs: [String : Attr]
+    let para: ParamType
+    let scope: Scope
+    var kernel: KerType
 }
 
 // op infos
@@ -202,4 +202,4 @@ let opInfos = [gConvType                    : (inputs: ["Input"], outputs: ["Out
                gConvAddAddPreluType         : (inputs: ["Input"], outputs: ["Out"]),
                gElementwiseAddPreluType     : (inputs: ["X"], outputs: ["Out"]),
                gFusionConvAddType           : (inputs: ["Input"], outputs: ["Out"])
-              ]
+]
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/BatchNormOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/BatchNormOp.swift
index a877620416..904e04c468 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/BatchNormOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/BatchNormOp.swift
@@ -16,52 +16,52 @@ import Foundation
 import Metal
 
 class BatchNormParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try BatchNormParam.inputX(inputs: opDesc.inputs, from: inScope)
-      if input.transpose != [0, 2, 3, 1] {
-        fatalError("batch norm only accepts NHWC")
-      }
-      output = try BatchNormParam.outputY(outputs: opDesc.outputs, from: inScope)
-      bias = try BatchNormParam.getFirstTensor(key: "Bias", map: opDesc.paraInputs, from: inScope)
-      mean = try BatchNormParam.getFirstTensor(key: "Mean", map: opDesc.paraInputs, from: inScope)
-      scale = try BatchNormParam.getFirstTensor(key: "Scale", map: opDesc.paraInputs, from: inScope)
-      variance = try BatchNormParam.getFirstTensor(key: "Variance", map: opDesc.paraInputs, from: inScope)
-      epsilon = try BatchNormParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
-      momentum = try BatchNormParam.getAttr(key: "momentum", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    /// Builds the batch-norm parameters from the op description and the scope.
+    /// - Throws: lookup/cast errors from the helpers, or `paramError` when `input.transpose` is not [0, 2, 3, 1].
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        input = try BatchNormParam.inputX(inputs: opDesc.inputs, from: inScope)
+        // Report an unsupported layout through `throws` instead of trapping the whole process.
+        guard input.transpose == [0, 2, 3, 1] else {
+            throw PaddleMobileError.paramError(message: "batch norm only accepts NHWC")
+        }
+        output = try BatchNormParam.outputY(outputs: opDesc.outputs, from: inScope)
+        // Bias, Mean, Scale and Variance are looked up in `paraInputs`, not in the regular inputs.
+        bias = try BatchNormParam.getFirstTensor(key: "Bias", map: opDesc.paraInputs, from: inScope)
+        mean = try BatchNormParam.getFirstTensor(key: "Mean", map: opDesc.paraInputs, from: inScope)
+        scale = try BatchNormParam.getFirstTensor(key: "Scale", map: opDesc.paraInputs, from: inScope)
+        variance = try BatchNormParam.getFirstTensor(key: "Variance", map: opDesc.paraInputs, from: inScope)
+        epsilon = try BatchNormParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
+        momentum = try BatchNormParam.getAttr(key: "momentum", attrs: opDesc.attrs)
     }
-  }
-  let input: Texture
-  var output: Texture
-  let bias: Tensor<P>
-  let mean: Tensor<P>
-  let scale: Tensor<P>
-  let variance: Tensor<P>
-  let epsilon: Float
-  let momentum: Float
+    let input: Texture
+    var output: Texture
+    let bias: Tensor<P>
+    let mean: Tensor<P>
+    let scale: Tensor<P>
+    let variance: Tensor<P>
+    let epsilon: Float
+    let momentum: Float
 }
 
 class BatchNormOp<P: PrecisionType>: Operator<BatchNormKernel<P>, BatchNormParam<P>>, Runable, Creator, InferShaperable{
-  typealias OpType = BatchNormOp<P>
-
-  func inferShape() {
-    para.output.dim = para.input.dim
-  }
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    typealias OpType = BatchNormOp<P>
+    
+    func inferShape() {
+        para.output.dim = para.input.dim
+    }
+    /// Runs this op by calling `kernel.compute` with `para`.
+    /// - Parameter device: not referenced by this implementation.
+    /// - Parameter buffer: command buffer given to `kernel.compute`.
+    /// - Throws: any error from `kernel.compute`, propagated unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        let device = para.output.metalTexture!.device
+        let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+        print(outputArray.strideArray())
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    let device = para.output.metalTexture!.device
-    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-    print(outputArray.strideArray())
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/BilinearInterpOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/BilinearInterpOp.swift
index a19dd10390..e44a49d900 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/BilinearInterpOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/BilinearInterpOp.swift
@@ -16,50 +16,50 @@ import Foundation
 import Metal
 
 class BilinearInterpParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try BilinearInterpParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try BilinearInterpParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      out_h = try BilinearInterpParam.getAttr(key: "out_h", attrs: opDesc.attrs)
-      out_w = try BilinearInterpParam.getAttr(key: "out_w", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    /// Builds the bilinear-interp parameters from the op description and the scope.
+    /// - Parameter opDesc: supplies the "X" input, "Out" output and the `out_h` / `out_w` attributes.
+    /// - Throws: lookup/cast errors from the helpers, or `paramError` when the input check below fails.
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        input = try BilinearInterpParam.inputX(inputs: opDesc.inputs, from: inScope)
+        output = try BilinearInterpParam.outputOut(outputs: opDesc.outputs, from: inScope)
+        out_h = try BilinearInterpParam.getAttr(key: "out_h", attrs: opDesc.attrs)
+        out_w = try BilinearInterpParam.getAttr(key: "out_w", attrs: opDesc.attrs)
+        // Report an unsupported input through `throws` instead of a message-less trap.
+        guard input.transpose == [0, 2, 3, 1], input.tensorDim.cout() == 4 else {
+            throw PaddleMobileError.paramError(message: "bilinear interp input must have transpose [0, 2, 3, 1] and tensorDim.cout() == 4")
+        }
     }
-    if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) {
-      fatalError()
-    }
-  }
-  let input: Texture
-  var output: Texture
-  let out_h: Int
-  let out_w: Int
+    let input: Texture
+    var output: Texture
+    let out_h: Int
+    let out_w: Int
 }
 
 class BilinearInterpOp<P: PrecisionType>: Operator<BilinearInterpKernel<P>, BilinearInterpParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = BilinearInterpOp<P>
-
-  func inferShape() {
-    //        para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = BilinearInterpOp<P>
+    
+    func inferShape() {
+        // Currently a no-op: copying the input dim to the output (`para.output.dim = para.input.dim`) is disabled.
+    }
+    
+    /// Runs this op by calling `kernel.compute` with `para`.
+    /// - Parameter device: not referenced by this implementation.
+    /// - Parameter buffer: command buffer given to `kernel.compute`.
+    /// - Throws: any error from `kernel.compute`, propagated unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        // Debug aid: converts the output texture to a `[Float32]` via `texture2tensor` and prints its `strideArray()`.
+        let device = para.output.metalTexture!.device
+        let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+        print(outputArray.strideArray())
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    let device = para.output.metalTexture!.device
-    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-//    print(outputArray)
-    print(outputArray.strideArray())
-  }
-  
+    
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/BoxcoderOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/BoxcoderOp.swift
index 4679885ab6..442d1af9ea 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/BoxcoderOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/BoxcoderOp.swift
@@ -15,69 +15,69 @@
 import Foundation
 
 class BoxcoderParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      priorBox = try BoxcoderParam.getFirstTensor(key: "PriorBox", map: opDesc.inputs, from: inScope)
-      priorBoxVar = try BoxcoderParam.getFirstTensor(key: "PriorBoxVar", map: opDesc.inputs, from: inScope)
-      targetBox = try BoxcoderParam.getFirstTensor(key: "TargetBox", map: opDesc.inputs, from: inScope)
-      output = try BoxcoderParam.getFirstTensor(key: "OutputBox", map: opDesc.outputs, from: inScope)
-      codeType = try BoxcoderParam.getAttr(key: "code_type", attrs: opDesc.attrs)
-      boxNormalized = try BoxcoderParam.getAttr(key: "box_normalized", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    /// Builds the box-coder parameters from the op description and the scope.
+    /// - Throws: any lookup/cast error raised by `getFirstTensor` or `getAttr`, propagated unchanged.
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        // Inputs "PriorBox", "PriorBoxVar", "TargetBox"; output "OutputBox"; then the two attributes.
+        priorBox = try BoxcoderParam.getFirstTensor(key: "PriorBox", map: opDesc.inputs, from: inScope)
+        priorBoxVar = try BoxcoderParam.getFirstTensor(key: "PriorBoxVar", map: opDesc.inputs, from: inScope)
+        targetBox = try BoxcoderParam.getFirstTensor(key: "TargetBox", map: opDesc.inputs, from: inScope)
+        output = try BoxcoderParam.getFirstTensor(key: "OutputBox", map: opDesc.outputs, from: inScope)
+        codeType = try BoxcoderParam.getAttr(key: "code_type", attrs: opDesc.attrs)
+        boxNormalized = try BoxcoderParam.getAttr(key: "box_normalized", attrs: opDesc.attrs)
+        // Sanity checks on dims, transposes and code type (`assert` is only evaluated in debug builds).
+        assert(priorBox.tensorDim.cout() == 2)
+        assert(priorBoxVar.tensorDim.cout() == 2)
+        assert(targetBox.tensorDim.cout() == 3)
+        assert(output.tensorDim.cout() == 3)
+        assert(priorBox.transpose == [0, 1, 2, 3])
+        assert(priorBoxVar.transpose == [0, 1, 2, 3])
+        assert(targetBox.transpose == [0, 1, 2, 3])
+        assert(codeType == "decode_center_size") // encode_center_size is not implemented
+        assert((targetBox.tensorDim.cout() == 3) && (targetBox.tensorDim[0] == 1)) // N must be 1 (only handle batch size = 1)
     }
-    assert(priorBox.tensorDim.cout() == 2)
-    assert(priorBoxVar.tensorDim.cout() == 2)
-    assert(targetBox.tensorDim.cout() == 3)
-    assert(output.tensorDim.cout() == 3)
-    assert(priorBox.transpose == [0, 1, 2, 3])
-    assert(priorBoxVar.transpose == [0, 1, 2, 3])
-    assert(targetBox.transpose == [0, 1, 2, 3])
-    assert(codeType == "decode_center_size") // encode_center_size is not implemented
-    assert((targetBox.tensorDim.cout() == 3) && (targetBox.tensorDim[0] == 1)) // N must be 1 (only handle batch size = 1)
-  }
-  let priorBox: Texture
-  let priorBoxVar: Texture
-  let targetBox: Texture
-  var output: Texture
-  let codeType: String
-  let boxNormalized: Bool
+    let priorBox: Texture
+    let priorBoxVar: Texture
+    let targetBox: Texture
+    var output: Texture
+    let codeType: String
+    let boxNormalized: Bool
 }
 
 class BoxcoderOp<P: PrecisionType>: Operator<BoxcoderKernel<P>, BoxcoderParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = BoxcoderOp<P>
-
-  func inferShape() {
-    //        para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = BoxcoderOp<P>
+    
+    func inferShape() {
+        // Currently a no-op: copying the input dim to the output (`para.output.dim = para.input.dim`) is disabled.
+    }
+    
+    /// Runs this op by calling `kernel.compute` with `para`.
+    /// - Parameter device: not referenced by this implementation.
+    /// - Parameter buffer: command buffer given to `kernel.compute`.
+    /// - Throws: any error from `kernel.compute`, propagated unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        let device = para.output.metalTexture!.device
+        let pbv : [Float32] = device.texture2tensor(texture: para.priorBoxVar.metalTexture!, dim: para.priorBoxVar.tensorDim.dims, transpose: para.priorBoxVar.transpose)
+        let pb : [Float32] = device.texture2tensor(texture: para.priorBox.metalTexture!, dim: para.priorBox.tensorDim.dims, transpose: para.priorBox.transpose)
+        let tb : [Float32] = device.texture2tensor(texture: para.targetBox.metalTexture!, dim: para.targetBox.tensorDim.dims, transpose: para.targetBox.transpose)
+        let out : [Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+        print(" prior box var ")
+        print(pbv.strideArray())
+        print(" target box ")
+        print(tb.strideArray())
+        print(" prior box ")
+        print(pb.strideArray())
+        print(" output ")
+        print(out.strideArray())
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    let device = para.output.metalTexture!.device
-    let pbv : [Float32] = device.texture2tensor(texture: para.priorBoxVar.metalTexture!, dim: para.priorBoxVar.tensorDim.dims, transpose: para.priorBoxVar.transpose)
-    let pb : [Float32] = device.texture2tensor(texture: para.priorBox.metalTexture!, dim: para.priorBox.tensorDim.dims, transpose: para.priorBox.transpose)
-    let tb : [Float32] = device.texture2tensor(texture: para.targetBox.metalTexture!, dim: para.targetBox.tensorDim.dims, transpose: para.targetBox.transpose)
-    let out : [Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-    print(" prior box var ")
-    print(pbv.strideArray())
-    print(" target box ")
-    print(tb.strideArray())
-    print(" prior box ")
-    print(pb.strideArray())
-    print(" output ")
-    print(out.strideArray())
-  }
-  
+    
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift
index c2c22d55af..a8034c681f 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift
@@ -15,62 +15,62 @@
 import Foundation
 
 class ConcatParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      guard let xlist = opDesc.inputs["X"] else {
-        fatalError()
-      }
-      for x in xlist {
-        guard let variant = inScope[x], let v = variant as? Texture else {
-          fatalError()
+    //typealias ParamPrecisionType = P
+    /// Gathers every "X" input texture, then reads `axis` and "Out"; invalid inputs throw `paramError` instead of trapping.
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            guard let xlist = opDesc.inputs["X"] else {
+                throw PaddleMobileError.paramError(message: "concat: inputs has no entry for key X")
+            }
+            for x in xlist {
+                guard let variant = inScope[x], let v = variant as? Texture else {
+                    throw PaddleMobileError.paramError(message: "concat: variable \(x) is missing from scope or is not a Texture")
+                }
+                if transpose.isEmpty {
+                    transpose = v.transpose
+                }
+                guard v.transpose == transpose else {
+                    throw PaddleMobileError.paramError(message: "concat: all inputs must share the same transpose")
+                }
+                input.append(v)
+            }
+            axis = try ConcatParam.getAttr(key: "axis", attrs: opDesc.attrs)
+            output = try ConcatParam.outputOut(outputs: opDesc.outputs, from: inScope)
+        } catch let error {
+            throw error
         }
-        if transpose.count == 0 {
-          transpose = v.transpose
-        }
-        if v.transpose != transpose {
-          fatalError()
-        }
-       
-        input.append(v)
-      }
-      axis = try ConcatParam.getAttr(key: "axis", attrs: opDesc.attrs)
-      output = try ConcatParam.outputOut(outputs: opDesc.outputs, from: inScope)
-    } catch let error {
-      throw error
     }
-  }
-  var input: [Texture] = []
-  var output: Texture
-  var transpose: [Int] = []
-  let axis: Int
+    var input: [Texture] = []
+    var output: Texture
+    var transpose: [Int] = []
+    let axis: Int
 }
 
 class ConcatOp<P: PrecisionType>: Operator<ConcatKernel<P>, ConcatParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = ConcatOp<P>
-
-  func inferShape() {
-    //        let dim = para.input.reduce([0, 0]) {[$0[0] + $1.dim[0], $1.dim[1]]}
-    //        para.output.dim = Dim.init(inDim: dim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = ConcatOp<P>
+    
+    func inferShape() {
+        // Currently a no-op. Disabled draft of the output-dim computation, kept for reference:
+        //   para.output.dim = Dim.init(inDim: para.input.reduce([0, 0]) {[$0[0] + $1.dim[0], $1.dim[1]]})
+    }
+    
+    /// Runs this op by calling `kernel.compute` with `para`.
+    /// - Parameter device: not referenced by this implementation.
+    /// - Parameter buffer: command buffer given to `kernel.compute`.
+    /// - Throws: any error from `kernel.compute`, propagated unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        // Debug aid: converts the output texture to a `[Float32]` via `texture2tensor` and prints its `strideArray()`.
+        let device = para.output.metalTexture!.device
+        let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+        print(outputArray.strideArray())
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
     
-    let device = para.output.metalTexture!.device
-    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-    print(outputArray.strideArray())
-  }
-  
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddAddPreluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddAddPreluOp.swift
index 552d72f436..e7865045e5 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddAddPreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddAddPreluOp.swift
@@ -16,94 +16,94 @@ import Foundation
 import Metal
 
 class ConvAddAddPreluParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      filter = try ConvAddAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
-      input = try ConvAddAddPreluParam.input(inputs: opDesc.inputs, from: inScope)
-      output = try ConvAddAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      stride = try ConvAddAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs)
-      paddings = try ConvAddAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
-      dilations = try ConvAddAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
-      groups = try ConvAddAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs)
-      alpha = try ConvAddAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
-      mode = try ConvAddAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
-      y = try ConvAddAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            filter = try ConvAddAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
+            input = try ConvAddAddPreluParam.input(inputs: opDesc.inputs, from: inScope)
+            output = try ConvAddAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            stride = try ConvAddAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs)
+            paddings = try ConvAddAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
+            dilations = try ConvAddAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
+            groups = try ConvAddAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs)
+            alpha = try ConvAddAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
+            mode = try ConvAddAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
+            y = try ConvAddAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  let input: Texture
-  let y: Tensor<P>
-  let filter: Tensor<P>
-  let mode: String
-  let alpha: Tensor<P>
-  var output: Texture
-  let stride: [Int32]
-  let paddings: [Int32]
-  let dilations: [Int32]
-  let groups: Int
+    
+    let input: Texture
+    let y: Tensor<P>
+    let filter: Tensor<P>
+    let mode: String
+    let alpha: Tensor<P>
+    var output: Texture
+    let stride: [Int32]
+    let paddings: [Int32]
+    let dilations: [Int32]
+    let groups: Int
 }
 
 class ConvAddAddPreluOp<P: PrecisionType>: Operator<ConvAddAddPreluKernel<P>, ConvAddAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
-  typealias OpType = ConvAddAddPreluOp<P>
-  
-  static func fusionNode() -> Node {
-    let beginNode = Node.init(inType: gConvType)
-    _ = beginNode
-      --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType)
-    return beginNode
-  }
-  
-  static func change() -> [String : [(from: String, to: String)]] {
-    return [:]
-  }
-  
-  static func fusionType() -> String {
-    return gConvAddAddPreluType
-  }
-  
-  static func needCheck() -> [(Int, String)] {
-    return [(2, "Y"), (2, "X")]
-  }
-  
-  
-  
-  func inferShape() {
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
+    typealias OpType = ConvAddAddPreluOp<P>
+    
+    static func fusionNode() -> Node {
+        let beginNode = Node.init(inType: gConvType)
+        _ = beginNode
+            --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType)
+        return beginNode
+    }
+    
+    static func change() -> [String : [(from: String, to: String)]] {
+        return [:]
+    }
+    
+    static func fusionType() -> String {
+        return gConvAddAddPreluType
+    }
+    
+    static func needCheck() -> [(Int, String)] {
+        return [(2, "Y"), (2, "X")]
+    }
+    
+    
+    
+    func inferShape() {
+        let inDims = para.input.dim
+        let filterDim = para.filter.dim
+        let strides = para.stride
+        let paddings = para.paddings
+        let dilations = para.dilations
+        
+        var outDim = [inDims[0]]
+        for i in 0..<strides.count {
+            let dilation: Int = Int(dilations[i])
+            let filterSize: Int = filterDim[i + 1]
+            let inputSize: Int = inDims[i + 1]
+            let padding: Int = Int(paddings[i])
+            let stride: Int = Int(strides[i])
+            let dKernel = dilation * (filterSize - 1) + 1
+            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+            outDim.append(outputSize)
+        }
+        outDim.append(filterDim[0])
+        para.output.dim = Dim.init(inDim: outDim)
+    }
     
-    var outDim = [inDims[0]]
-    for i in 0..<strides.count {
-      let dilation: Int = Int(dilations[i])
-      let filterSize: Int = filterDim[i + 1]
-      let inputSize: Int = inDims[i + 1]
-      let padding: Int = Int(paddings[i])
-      let stride: Int = Int(strides[i])
-      let dKernel = dilation * (filterSize - 1) + 1
-      let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
-      outDim.append(outputSize)
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
     }
-    outDim.append(filterDim[0])
-    para.output.dim = Dim.init(inDim: outDim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
     }
-  }
-  
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
-  }
-  
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddBatchNormReluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddBatchNormReluOp.swift
index 6aacd4208e..311967c22c 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddBatchNormReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddBatchNormReluOp.swift
@@ -16,115 +16,115 @@ import Foundation
 
 
 class ConvAddBatchNormReluParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      
-      filter = try ConvAddBatchNormReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
-      input = try ConvAddBatchNormReluParam.input(inputs: opDesc.inputs, from: inScope)
-      output = try ConvAddBatchNormReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      stride = try ConvAddBatchNormReluParam.getAttr(key: "strides", attrs: opDesc.attrs)
-      paddings = try ConvAddBatchNormReluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
-      dilations = try ConvAddBatchNormReluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
-      epsilon = try ConvAddBatchNormReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
-      
-      groups = try ConvAddBatchNormReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
-      variance = try ConvAddBatchNormReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope)
-      bias = try ConvAddBatchNormReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope)
-      
-      scale = try ConvAddBatchNormReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
-      mean = try ConvAddBatchNormReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
-      y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            
+            filter = try ConvAddBatchNormReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
+            input = try ConvAddBatchNormReluParam.input(inputs: opDesc.inputs, from: inScope)
+            output = try ConvAddBatchNormReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            stride = try ConvAddBatchNormReluParam.getAttr(key: "strides", attrs: opDesc.attrs)
+            paddings = try ConvAddBatchNormReluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
+            dilations = try ConvAddBatchNormReluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
+            epsilon = try ConvAddBatchNormReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
+            
+            groups = try ConvAddBatchNormReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
+            variance = try ConvAddBatchNormReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope)
+            bias = try ConvAddBatchNormReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope)
+            
+            scale = try ConvAddBatchNormReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
+            mean = try ConvAddBatchNormReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
+            y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  let input: Texture
-  
-  let variance: Tensor<P>
-  let bias: Tensor<P>
-  let mean: Tensor<P>
-  let scale: Tensor<P>
-  let y: Tensor<P>
-  let filter: Tensor<P>
-  let epsilon: Float32
-  var newScale: MTLBuffer?
-  var newBiase: MTLBuffer?
-  
-  var output: Texture
-  let stride: [Int32]
-  let paddings: [Int32]
-  let dilations: [Int32]
-  let groups: Int
+    
+    let input: Texture
+    
+    let variance: Tensor<P>
+    let bias: Tensor<P>
+    let mean: Tensor<P>
+    let scale: Tensor<P>
+    let y: Tensor<P>
+    let filter: Tensor<P>
+    let epsilon: Float32
+    var newScale: MTLBuffer?
+    var newBiase: MTLBuffer?
+    
+    var output: Texture
+    let stride: [Int32]
+    let paddings: [Int32]
+    let dilations: [Int32]
+    let groups: Int
 }
 
 class ConvAddBatchNormReluOp<P: PrecisionType>: Operator<ConvAddBatchNormReluKernel<P>, ConvAddBatchNormReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
-  
-  typealias OpType = ConvAddBatchNormReluOp<P>
-  
-  func inferShape() {
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
     
-    var outDim = [inDims[0]]
-    for i in 0..<strides.count {
-      let dilation: Int = Int(dilations[i])
-      let filterSize: Int = filterDim[i + 1]
-      let inputSize: Int = inDims[i + 1]
-      let padding: Int = Int(paddings[i])
-      let stride: Int = Int(strides[i])
-      let dKernel = dilation * (filterSize - 1) + 1
-      let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
-      outDim.append(outputSize)
+    typealias OpType = ConvAddBatchNormReluOp<P>
+    
+    func inferShape() {
+        let inDims = para.input.dim
+        let filterDim = para.filter.dim
+        let strides = para.stride
+        let paddings = para.paddings
+        let dilations = para.dilations
+        
+        var outDim = [inDims[0]]
+        for i in 0..<strides.count {
+            let dilation: Int = Int(dilations[i])
+            let filterSize: Int = filterDim[i + 1]
+            let inputSize: Int = inDims[i + 1]
+            let padding: Int = Int(paddings[i])
+            let stride: Int = Int(strides[i])
+            let dKernel = dilation * (filterSize - 1) + 1
+            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+            outDim.append(outputSize)
+        }
+        outDim.append(filterDim[0])
+        para.output.dim = Dim.init(inDim: outDim)
+    }
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
     }
-    outDim.append(filterDim[0])
-    para.output.dim = Dim.init(inDim: outDim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    static func fusionNode() -> Node {
+        let beginNode = Node.init(inType: gConvType)
+        _ = beginNode
+            --> Node.init(inType: gElementwiseAddType)
+            --> Node.init(inType: gBatchNormType)
+            --> Node.init(inType: gReluType)
+        return beginNode
     }
-  }
-  
-  static func fusionNode() -> Node {
-    let beginNode = Node.init(inType: gConvType)
-    _ = beginNode
-      --> Node.init(inType: gElementwiseAddType)
-      --> Node.init(inType: gBatchNormType)
-      --> Node.init(inType: gReluType)
-    return beginNode
-  }
-  
-  static func change() -> [String : [(from: String, to: String)]] {
-    return [:]
-  }
-  
-  static func fusionType() -> String {
-    return gConvAddBatchNormReluType
-  }
-  
-  func delogOutput() {
-    print(" conv add batchnorm relu output ")
-    print(para.output.toTensor().strideArray())
-    //        let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false)
-    //        para.filter.logDataPointer(header: "filter data pointer: ")
-    //        print("filter: \(para.filter)")
     
-    //        print("biase: \(para.y)")
-    //        print("padding: \(para.paddings)")
-    //        print("stride: \(para.stride)")
+    static func change() -> [String : [(from: String, to: String)]] {
+        return [:]
+    }
     
-    //        let _: P? = para.y.buffer?.logDesc(header: " biase: ", stridable: false)
-    //        let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false)
-    //        let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false)
+    static func fusionType() -> String {
+        return gConvAddBatchNormReluType
+    }
     
-    //        let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false)
-  }
+    func delogOutput() {
+        print(" conv add batchnorm relu output ")
+        print(para.output.toTensor().strideArray())
+        //        let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false)
+        //        para.filter.logDataPointer(header: "filter data pointer: ")
+        //        print("filter: \(para.filter)")
+        
+        //        print("biase: \(para.y)")
+        //        print("padding: \(para.paddings)")
+        //        print("stride: \(para.stride)")
+        
+        //        let _: P? = para.y.buffer?.logDesc(header: " biase: ", stridable: false)
+        //        let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false)
+        //        let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false)
+        
+        //        let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false)
+    }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift
index 923c2c210d..7b9958a066 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift
@@ -15,103 +15,103 @@
 import Foundation
 
 class ConvAddParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      filter = try ConvAddParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
-      input = try ConvAddParam.input(inputs: opDesc.inputs, from: inScope)
-      output = try ConvAddParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      stride = try ConvAddParam.getAttr(key: "strides", attrs: opDesc.attrs)
-      paddings = try ConvAddParam.getAttr(key: "paddings", attrs: opDesc.attrs)
-      dilations = try ConvAddParam.getAttr(key: "dilations", attrs: opDesc.attrs)
-      groups = try ConvAddParam.getAttr(key: "groups", attrs: opDesc.attrs)
-      
-      y = try ConvAddParam.inputY(inputs: opDesc.paraInputs, from: inScope)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            filter = try ConvAddParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
+            input = try ConvAddParam.input(inputs: opDesc.inputs, from: inScope)
+            output = try ConvAddParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            stride = try ConvAddParam.getAttr(key: "strides", attrs: opDesc.attrs)
+            paddings = try ConvAddParam.getAttr(key: "paddings", attrs: opDesc.attrs)
+            dilations = try ConvAddParam.getAttr(key: "dilations", attrs: opDesc.attrs)
+            groups = try ConvAddParam.getAttr(key: "groups", attrs: opDesc.attrs)
+            
+            y = try ConvAddParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  let input: Texture
-  let y: Tensor<P>
-  let filter: Tensor<P>
-  
-  var output: Texture
-  let stride: [Int32]
-  let paddings: [Int32]
-  let dilations: [Int32]
-  let groups: Int
+    
+    let input: Texture
+    let y: Tensor<P>
+    let filter: Tensor<P>
+    
+    var output: Texture
+    let stride: [Int32]
+    let paddings: [Int32]
+    let dilations: [Int32]
+    let groups: Int
 }
 
 class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>, Runable, Creator, InferShaperable, Fusion{
-  typealias OpType = ConvAddOp<P>
-
-  static func fusionNode() -> Node {
-    let beginNode = Node.init(inType: gConvType)
-    _ = beginNode
-      --> Node.init(inType: gElementwiseAddType)
-    return beginNode
-  }
-  
-  static func change() -> [String : [(from: String, to: String)]] {
-    return [:]
-  }
-  
-  static func fusionType() -> String {
-    return gConvAddType
-  }
-  
-  func inferShape() {
+    typealias OpType = ConvAddOp<P>
+    
+    static func fusionNode() -> Node {
+        let beginNode = Node.init(inType: gConvType)
+        _ = beginNode
+            --> Node.init(inType: gElementwiseAddType)
+        return beginNode
+    }
     
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
+    static func change() -> [String : [(from: String, to: String)]] {
+        return [:]
+    }
     
-    var outDim = [inDims[0]]
-    for i in 0..<strides.count {
-      let dilation: Int = Int(dilations[i])
-      let filterSize: Int = filterDim[i + 1]
-      let inputSize: Int = inDims[i + 1]
-      let padding: Int = Int(paddings[i])
-      let stride: Int = Int(strides[i])
-      let dKernel = dilation * (filterSize - 1) + 1
-      let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
-      outDim.append(outputSize)
+    static func fusionType() -> String {
+        return gConvAddType
     }
-    outDim.append(filterDim[0])
-    para.output.dim = Dim.init(inDim: outDim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    func inferShape() {
+        
+        let inDims = para.input.dim
+        let filterDim = para.filter.dim
+        let strides = para.stride
+        let paddings = para.paddings
+        let dilations = para.dilations
+        
+        var outDim = [inDims[0]]
+        for i in 0..<strides.count {
+            let dilation: Int = Int(dilations[i])
+            let filterSize: Int = filterDim[i + 1]
+            let inputSize: Int = inDims[i + 1]
+            let padding: Int = Int(paddings[i])
+            let stride: Int = Int(strides[i])
+            let dKernel = dilation * (filterSize - 1) + 1
+            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+            outDim.append(outputSize)
+        }
+        outDim.append(filterDim[0])
+        para.output.dim = Dim.init(inDim: outDim)
     }
-  }
-  
-  func delogOutput() {
-//    print("op \(type): ")
-//    print(" padding: ")
-//    print(para.paddings)
-//    print("stride: ")
-//    print(para.stride)
-//    print("dilations: ")
-//    print(para.dilations)
-//    print(" para input dim: ")
-//    print(para.input.dim)
-//    print(" para filter dim: ")
-//    print(para.filter.dim)
-//    print(" para output dim: ")
-//    print(para.output.dim)
-//    print(" biase: ")
-//    let biase: [Float32] = para.y.buffer.array()
-//    print(biase)
     
-    print(" \(type) output: ")
-    print(para.output.metalTexture)
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
-  }
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        //    print("op \(type): ")
+        //    print(" padding: ")
+        //    print(para.paddings)
+        //    print("stride: ")
+        //    print(para.stride)
+        //    print("dilations: ")
+        //    print(para.dilations)
+        //    print(" para input dim: ")
+        //    print(para.input.dim)
+        //    print(" para filter dim: ")
+        //    print(para.filter.dim)
+        //    print(" para output dim: ")
+        //    print(para.output.dim)
+        //    print(" biase: ")
+        //    let biase: [Float32] = para.y.buffer.array()
+        //    print(biase)
+        
+        print(" \(type) output: ")
+        print(para.output.metalTexture)
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+    }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddPreluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddPreluOp.swift
index 1c0bbba8d9..dc32056224 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddPreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddPreluOp.swift
@@ -15,87 +15,87 @@
 import Foundation
 
 class ConvAddPreluParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      filter = try ConvAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
-      input = try ConvAddPreluParam.input(inputs: opDesc.inputs, from: inScope)
-      output = try ConvAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      stride = try ConvAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs)
-      paddings = try ConvAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
-      dilations = try ConvAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
-      groups = try ConvAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs)
-      alpha = try ConvAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
-      mode = try ConvAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
-      y = try ConvAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            filter = try ConvAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
+            input = try ConvAddPreluParam.input(inputs: opDesc.inputs, from: inScope)
+            output = try ConvAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            stride = try ConvAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs)
+            paddings = try ConvAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
+            dilations = try ConvAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
+            groups = try ConvAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs)
+            alpha = try ConvAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
+            mode = try ConvAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
+            y = try ConvAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  let input: Texture
-  let y: Tensor<P>
-  let filter: Tensor<P>
-  let mode: String
-  let alpha: Tensor<P>
-  var output: Texture
-  let stride: [Int32]
-  let paddings: [Int32]
-  let dilations: [Int32]
-  let groups: Int
+    
+    let input: Texture
+    let y: Tensor<P>
+    let filter: Tensor<P>
+    let mode: String
+    let alpha: Tensor<P>
+    var output: Texture
+    let stride: [Int32]
+    let paddings: [Int32]
+    let dilations: [Int32]
+    let groups: Int
 }
 
 class ConvAddPreluOp<P: PrecisionType>: Operator<ConvAddPreluKernel<P>, ConvAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
-  typealias OpType = ConvAddPreluOp<P>
-  
-  static func fusionNode() -> Node {
-    let beginNode = Node.init(inType: gConvType)
-    _ = beginNode
-      --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType)
-    return beginNode
-  }
-  
-  static func change() -> [String : [(from: String, to: String)]] {
-    return [:]
-  }
-  
-  static func fusionType() -> String {
-    return gConvAddPreluType
-  }
-  
-  func inferShape() {
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
+    typealias OpType = ConvAddPreluOp<P>
+    
+    static func fusionNode() -> Node {
+        let beginNode = Node.init(inType: gConvType)
+        _ = beginNode
+            --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType)
+        return beginNode
+    }
     
-    var outDim = [inDims[0]]
-    for i in 0..<strides.count {
-      let dilation: Int = Int(dilations[i])
-      let filterSize: Int = filterDim[i + 1]
-      let inputSize: Int = inDims[i + 1]
-      let padding: Int = Int(paddings[i])
-      let stride: Int = Int(strides[i])
-      let dKernel = dilation * (filterSize - 1) + 1
-      let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
-      outDim.append(outputSize)
+    static func change() -> [String : [(from: String, to: String)]] {
+        return [:]
     }
-    outDim.append(filterDim[0])
-    para.output.dim = Dim.init(inDim: outDim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    static func fusionType() -> String {
+        return gConvAddPreluType
+    }
+    
+    func inferShape() {
+        let inDims = para.input.dim
+        let filterDim = para.filter.dim
+        let strides = para.stride
+        let paddings = para.paddings
+        let dilations = para.dilations
+        
+        var outDim = [inDims[0]]
+        for i in 0..<strides.count {
+            let dilation: Int = Int(dilations[i])
+            let filterSize: Int = filterDim[i + 1]
+            let inputSize: Int = inDims[i + 1]
+            let padding: Int = Int(paddings[i])
+            let stride: Int = Int(strides[i])
+            let dKernel = dilation * (filterSize - 1) + 1
+            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+            outDim.append(outputSize)
+        }
+        outDim.append(filterDim[0])
+        para.output.dim = Dim.init(inDim: outDim)
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
-  }
-  
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+    }
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvBNReluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvBNReluOp.swift
index 423e55e391..1a973c51ef 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvBNReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvBNReluOp.swift
@@ -15,101 +15,101 @@
 import Foundation
 
 class ConvBNReluParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      filter = try ConvBNReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
-      input = try ConvBNReluParam.input(inputs: opDesc.inputs, from: inScope)
-      output = try ConvBNReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      stride = try ConvBNReluParam.getAttr(key: "strides", attrs: opDesc.attrs)
-      paddings = try ConvBNReluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
-      dilations = try ConvBNReluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
-      epsilon = try ConvBNReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
-      
-      groups = try ConvBNReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
-      variance = try ConvBNReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope)
-      bias = try ConvBNReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope)
-      scale = try ConvBNReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
-      mean = try ConvBNReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            filter = try ConvBNReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
+            input = try ConvBNReluParam.input(inputs: opDesc.inputs, from: inScope)
+            output = try ConvBNReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            stride = try ConvBNReluParam.getAttr(key: "strides", attrs: opDesc.attrs)
+            paddings = try ConvBNReluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
+            dilations = try ConvBNReluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
+            epsilon = try ConvBNReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
+            
+            groups = try ConvBNReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
+            variance = try ConvBNReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope)
+            bias = try ConvBNReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope)
+            scale = try ConvBNReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
+            mean = try ConvBNReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  let input: Texture
-  let variance: Tensor<P>
-  let bias: Tensor<P>
-  let mean: Tensor<P>
-  let scale: Tensor<P>
-  let filter: Tensor<P>
-  let epsilon: Float32
-  var newScale: MTLBuffer?
-  var newBiase: MTLBuffer?
-  
-  var output: Texture
-  let stride: [Int32]
-  let paddings: [Int32]
-  let dilations: [Int32]
-  let groups: Int
+    
+    let input: Texture
+    let variance: Tensor<P>
+    let bias: Tensor<P>
+    let mean: Tensor<P>
+    let scale: Tensor<P>
+    let filter: Tensor<P>
+    let epsilon: Float32
+    var newScale: MTLBuffer?
+    var newBiase: MTLBuffer?
+    
+    var output: Texture
+    let stride: [Int32]
+    let paddings: [Int32]
+    let dilations: [Int32]
+    let groups: Int
 }
 
 class ConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
-  typealias OpType = ConvBNReluOp<P>
-  
-  func inputs() -> [Variant] {
-    return [para.input, para.variance, para.bias, para.mean, para.scale, para.filter]
-  }
-  
-  
-  func inferShape() {
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
+    typealias OpType = ConvBNReluOp<P>
+    
+    func inputs() -> [Variant] {
+        return [para.input, para.variance, para.bias, para.mean, para.scale, para.filter]
+    }
     
-    var outDim = [inDims[0]]
-    for i in 0..<strides.count {
-      let dilation: Int = Int(dilations[i])
-      let filterSize: Int = filterDim[i + 1]
-      let inputSize: Int = inDims[i + 1]
-      let padding: Int = Int(paddings[i])
-      let stride: Int = Int(strides[i])
-      let dKernel = dilation * (filterSize - 1) + 1
-      let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
-      outDim.append(outputSize)
+    
+    func inferShape() {
+        let inDims = para.input.dim
+        let filterDim = para.filter.dim
+        let strides = para.stride
+        let paddings = para.paddings
+        let dilations = para.dilations
+        
+        var outDim = [inDims[0]]
+        for i in 0..<strides.count {
+            let dilation: Int = Int(dilations[i])
+            let filterSize: Int = filterDim[i + 1]
+            let inputSize: Int = inDims[i + 1]
+            let padding: Int = Int(paddings[i])
+            let stride: Int = Int(strides[i])
+            let dKernel = dilation * (filterSize - 1) + 1
+            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+            outDim.append(outputSize)
+        }
+        outDim.append(filterDim[0])
+        para.output.dim = Dim.init(inDim: outDim)
+    }
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
     }
-    outDim.append(filterDim[0])
-    para.output.dim = Dim.init(inDim: outDim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    static func fusionNode() -> Node {
+        let beginNode = Node.init(inType: gConvType)
+        _ = beginNode
+            --> Node.init(inType: gBatchNormType)
+            --> Node.init(inType: gReluType)
+        return beginNode
     }
-  }
-  
-  static func fusionNode() -> Node {
-    let beginNode = Node.init(inType: gConvType)
-    _ = beginNode
-      --> Node.init(inType: gBatchNormType)
-      --> Node.init(inType: gReluType)
-    return beginNode
-  }
-  
-  static func change() -> [String : [(from: String, to: String)]] {
-    return [:]
-  }
-  
-  static func fusionType() -> String {
-    return gConvBnReluType
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
-  }
-  
+    
+    static func change() -> [String : [(from: String, to: String)]] {
+        return [:]
+    }
+    
+    static func fusionType() -> String {
+        return gConvBnReluType
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
+    }
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift
index c66813b166..2d402ae431 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift
@@ -15,67 +15,67 @@
 import Foundation
 
 class ConvParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      filter = try ConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
-      input = try ConvParam.input(inputs: opDesc.inputs, from: inScope)
-      output = try ConvParam.output(outputs: opDesc.outputs, from: inScope)
-      stride = try ConvParam.getAttr(key: "strides", attrs: opDesc.attrs)
-      paddings = try ConvParam.getAttr(key: "paddings", attrs: opDesc.attrs)
-      dilations = try ConvParam.getAttr(key: "dilations", attrs: opDesc.attrs)
-      groups = try ConvParam.getAttr(key: "groups", attrs: opDesc.attrs)
-      
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            filter = try ConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
+            input = try ConvParam.input(inputs: opDesc.inputs, from: inScope)
+            output = try ConvParam.output(outputs: opDesc.outputs, from: inScope)
+            stride = try ConvParam.getAttr(key: "strides", attrs: opDesc.attrs)
+            paddings = try ConvParam.getAttr(key: "paddings", attrs: opDesc.attrs)
+            dilations = try ConvParam.getAttr(key: "dilations", attrs: opDesc.attrs)
+            groups = try ConvParam.getAttr(key: "groups", attrs: opDesc.attrs)
+            
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  let input: Texture
-  let filter: Tensor<P>
-  var output: Texture
-  let stride: [Int32]
-  let paddings: [Int32]
-  let dilations: [Int32]
-  let groups: Int
+    
+    let input: Texture
+    let filter: Tensor<P>
+    var output: Texture
+    let stride: [Int32]
+    let paddings: [Int32]
+    let dilations: [Int32]
+    let groups: Int
 }
 
 class ConvOp<P: PrecisionType>: Operator<ConvKernel<P>, ConvParam<P>>, Runable, Creator, InferShaperable {
-  typealias OpType = ConvOp<P>
-
-  func inferShape() {
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
+    typealias OpType = ConvOp<P>
     
-    var outDim = [inDims[0]]
-    for i in 0..<strides.count {
-      let dilation: Int = Int(dilations[i])
-      let filterSize: Int = filterDim[i + 1]
-      let inputSize: Int = inDims[i + 1]
-      let padding: Int = Int(paddings[i])
-      let stride: Int = Int(strides[i])
-      let dKernel = dilation * (filterSize - 1) + 1
-      let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
-      outDim.append(outputSize)
+    func inferShape() {
+        let inDims = para.input.dim
+        let filterDim = para.filter.dim
+        let strides = para.stride
+        let paddings = para.paddings
+        let dilations = para.dilations
+        
+        var outDim = [inDims[0]]
+        for i in 0..<strides.count {
+            let dilation: Int = Int(dilations[i])
+            let filterSize: Int = filterDim[i + 1]
+            let inputSize: Int = inDims[i + 1]
+            let padding: Int = Int(paddings[i])
+            let stride: Int = Int(strides[i])
+            let dKernel = dilation * (filterSize - 1) + 1
+            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+            outDim.append(outputSize)
+        }
+        outDim.append(filterDim[0])
+        para.output.dim = Dim.init(inDim: outDim)
     }
-    outDim.append(filterDim[0])
-    para.output.dim = Dim.init(inDim: outDim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        print("conv output : ")
+        print(para.output.toTensor().strideArray())
+        //        let _: Float16? = para.output.metalTexture.logDesc()
     }
-  }
-  
-  func delogOutput() {
-    print("conv output : ")
-    print(para.output.toTensor().strideArray())
-    //        let _: Float16? = para.output.metalTexture.logDesc()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvTransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvTransposeOp.swift
index c035f403a6..8322263e7c 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvTransposeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvTransposeOp.swift
@@ -15,44 +15,44 @@
 import Foundation
 
 class ConvTransposeParam<P: PrecisionType>: ConvParam<P> {
-  //typealias ParamPrecisionType = P
+    //typealias ParamPrecisionType = P
     required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      try super.init(opDesc: opDesc, inScope: inScope)
-    } catch let error {
-      throw error
+        do {
+            try super.init(opDesc: opDesc, inScope: inScope)
+        } catch let error {
+            throw error
+        }
     }
-  }
 }
 
 class ConvTransposeOp<P: PrecisionType>: Operator<ConvTransposeKernel<P>, ConvTransposeParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = ConvTransposeOp<P>
-  
-  func inferShape() {
-    // para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = ConvTransposeOp<P>
+    
+    func inferShape() {
+        // para.output.dim = para.input.dim
     }
-  }
-  
-  func delogOutput() {
-  
-    print(" \(type) output: ")
-    let padToFourDim = para.output.padToFourDim
-    if para.output.transpose == [0, 1, 2, 3] {
-      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
-      print(outputArray.strideArray())
-    } else if para.output.transpose == [0, 2, 3, 1] {
-      let output = para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]))
-      print(output.strideArray())
-    } else {
-      print(" not implement")
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        
+        print(" \(type) output: ")
+        let padToFourDim = para.output.padToFourDim
+        if para.output.transpose == [0, 1, 2, 3] {
+            let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
+            print(outputArray.strideArray())
+        } else if para.output.transpose == [0, 2, 3, 1] {
+            let output = para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]))
+            print(output.strideArray())
+        } else {
+            print(" not implement")
+        }
     }
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/DepthwiseConvOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/DepthwiseConvOp.swift
index 96818a9fd8..4686501fdd 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/DepthwiseConvOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/DepthwiseConvOp.swift
@@ -15,41 +15,41 @@
 import Foundation
 
 class DepthConvOp<P: PrecisionType>: Operator<ConvKernel<P>, ConvParam<P>>, Runable, Creator, InferShaperable {
-
-  typealias OpType = DepthConvOp<P>
-  
-  func inferShape() {
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
     
-    var outDim = [inDims[0]]
-    for i in 0..<strides.count {
-      let dilation: Int = Int(dilations[i])
-      let filterSize: Int = filterDim[i + 1]
-      let inputSize: Int = inDims[i + 1]
-      let padding: Int = Int(paddings[i])
-      let stride: Int = Int(strides[i])
-      let dKernel = dilation * (filterSize - 1) + 1
-      let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
-      outDim.append(outputSize)
+    typealias OpType = DepthConvOp<P>
+    
+    func inferShape() {
+        let inDims = para.input.dim
+        let filterDim = para.filter.dim
+        let strides = para.stride
+        let paddings = para.paddings
+        let dilations = para.dilations
+        
+        var outDim = [inDims[0]]
+        for i in 0..<strides.count {
+            let dilation: Int = Int(dilations[i])
+            let filterSize: Int = filterDim[i + 1]
+            let inputSize: Int = inDims[i + 1]
+            let padding: Int = Int(paddings[i])
+            let stride: Int = Int(strides[i])
+            let dKernel = dilation * (filterSize - 1) + 1
+            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+            outDim.append(outputSize)
+        }
+        outDim.append(filterDim[0])
+        para.output.dim = Dim.init(inDim: outDim)
     }
-    outDim.append(filterDim[0])
-    para.output.dim = Dim.init(inDim: outDim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/DwConvBNReluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/DwConvBNReluOp.swift
index 8575cfd88c..ef3bc21316 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/DwConvBNReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/DwConvBNReluOp.swift
@@ -15,56 +15,56 @@
 import Foundation
 
 class DwConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
-  typealias OpType = ConvBNReluOp<P>
-  
-  func inferShape() {
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
+    typealias OpType = ConvBNReluOp<P>
     
-    var outDim = [inDims[0]]
-    for i in 0..<strides.count {
-      let dilation: Int = Int(dilations[i])
-      let filterSize: Int = filterDim[i + 1]
-      let inputSize: Int = inDims[i + 1]
-      let padding: Int = Int(paddings[i])
-      let stride: Int = Int(strides[i])
-      let dKernel = dilation * (filterSize - 1) + 1
-      let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
-      outDim.append(outputSize)
+    func inferShape() {
+        let inDims = para.input.dim
+        let filterDim = para.filter.dim
+        let strides = para.stride
+        let paddings = para.paddings
+        let dilations = para.dilations
+        
+        var outDim = [inDims[0]]
+        for i in 0..<strides.count {
+            let dilation: Int = Int(dilations[i])
+            let filterSize: Int = filterDim[i + 1]
+            let inputSize: Int = inDims[i + 1]
+            let padding: Int = Int(paddings[i])
+            let stride: Int = Int(strides[i])
+            let dKernel = dilation * (filterSize - 1) + 1
+            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+            outDim.append(outputSize)
+        }
+        outDim.append(filterDim[0])
+        para.output.dim = Dim.init(inDim: outDim)
     }
-    outDim.append(filterDim[0])
-    para.output.dim = Dim.init(inDim: outDim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    static func fusionNode() -> Node {
+        let beginNode = Node.init(inType: gDepthConvType)
+        _ = beginNode
+            --> Node.init(inType: gBatchNormType)
+            --> Node.init(inType: gReluType)
+        return beginNode
+    }
+    
+    static func change() -> [String : [(from: String, to: String)]] {
+        return [:]
+    }
+    
+    static func fusionType() -> String {
+        return gDwConvBnReluType
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
     }
-  }
-  
-  static func fusionNode() -> Node {
-    let beginNode = Node.init(inType: gDepthConvType)
-    _ = beginNode
-      --> Node.init(inType: gBatchNormType)
-      --> Node.init(inType: gReluType)
-    return beginNode
-  }
-  
-  static func change() -> [String : [(from: String, to: String)]] {
-    return [:]
-  }
-  
-  static func fusionType() -> String {
-    return gDwConvBnReluType
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddOp.swift
index 5fa69d4f44..cd5307b584 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddOp.swift
@@ -16,80 +16,80 @@ import Foundation
 import Metal
 
 class ElementwiseAddParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      inputX = try ElementwiseAddParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try ElementwiseAddParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      axis = try ElementwiseAddParam.getAttr(key: "axis", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            inputX = try ElementwiseAddParam.inputX(inputs: opDesc.inputs, from: inScope)
+            output = try ElementwiseAddParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            axis = try ElementwiseAddParam.getAttr(key: "axis", attrs: opDesc.attrs)
+        } catch let error {
+            throw error
+        }
+        do {
+            inputY = try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+        } catch _ {
+            let tensorY: Tensor<P> = try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+            let device = inputX.metalTexture!.device
+            inputY = Texture.init(device: device, inDim: tensorY.dim)
+            let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel()))
+            inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision)
+        }
+        
+        //    required init(device: MTLDevice, param: ElementwiseAddParam<P>) {
+        //      param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
+        //      if computePrecision == .Float32 {
+        //        super.init(device: device, inFunctionName: "elementwise_add")
+        //      } else if computePrecision == .Float16 {
+        //        super.init(device: device, inFunctionName: "elementwise_add_half")
+        //      } else {
+        //        fatalError()
+        //      }
+        //    }
+        
+        var offset = axis
+        if axis == -1 {
+            offset = inputX.tensorDim.cout() - inputY.tensorDim.cout()
+        }
+        for i in 0..<(inputY.tensorDim.cout()) {
+            assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i])
+        }
     }
-    do {
-      inputY = try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope)
-    } catch _ {
-      let tensorY: Tensor<P> = try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope)
-      let device = inputX.metalTexture!.device
-      inputY = Texture.init(device: device, inDim: tensorY.dim)
-      let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel()))
-      inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision)
-    }
-    
-//    required init(device: MTLDevice, param: ElementwiseAddParam<P>) {
-//      param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
-//      if computePrecision == .Float32 {
-//        super.init(device: device, inFunctionName: "elementwise_add")
-//      } else if computePrecision == .Float16 {
-//        super.init(device: device, inFunctionName: "elementwise_add_half")
-//      } else {
-//        fatalError()
-//      }
-//    }
     
-    var offset = axis
-    if axis == -1 {
-      offset = inputX.tensorDim.cout() - inputY.tensorDim.cout()
-    }
-    for i in 0..<(inputY.tensorDim.cout()) {
-      assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i])
-    }
-  }
-  
-  var inputX: Texture
-  var inputY: Texture
-  var output: Texture
-  var axis: Int
+    var inputX: Texture
+    var inputY: Texture
+    var output: Texture
+    var axis: Int
 }
 
 class ElementwiseAddOp<P: PrecisionType>: Operator<ElementwiseAddKernel<P>, ElementwiseAddParam<P>>, Runable, Creator, InferShaperable{
-  typealias OpType = ElementwiseAddOp<P>
-  
-  func inferShape() {
-//    para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    typealias OpType = ElementwiseAddOp<P>
+    
+    func inferShape() {
+        //    para.output.dim = para.input.dim
+    }
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output)
     
-    let padToFourDim = para.output.padToFourDim
-    if para.output.transpose == [0, 1, 2, 3] {
-      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
-      print(outputArray.strideArray())
-    } else if para.output.transpose == [0, 2, 3, 1] {
-      print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
-    } else {
-      print(" not implement")
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output)
+        
+        let padToFourDim = para.output.padToFourDim
+        if para.output.transpose == [0, 1, 2, 3] {
+            let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
+            print(outputArray.strideArray())
+        } else if para.output.transpose == [0, 2, 3, 1] {
+            print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+        } else {
+            print(" not implement")
+        }
     }
-  }
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddPreluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddPreluOp.swift
index 6a49d7bfa2..bd853f6c0f 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddPreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddPreluOp.swift
@@ -16,101 +16,101 @@ import Foundation
 import Metal
 
 class ElementwiseAddPreluParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      alpha = try ElementwiseAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
-      mode = try ElementwiseAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
-      inputX = try ElementwiseAddPreluParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try ElementwiseAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      axis = try ElementwiseAddPreluParam.getAttr(key: "axis", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            alpha = try ElementwiseAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
+            mode = try ElementwiseAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
+            inputX = try ElementwiseAddPreluParam.inputX(inputs: opDesc.inputs, from: inScope)
+            output = try ElementwiseAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            axis = try ElementwiseAddPreluParam.getAttr(key: "axis", attrs: opDesc.attrs)
+        } catch let error {
+            throw error
+        }
+        do {
+            inputY = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+        } catch _ {
+            let tensorY: Tensor<P> = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+            let device = inputX.metalTexture!.device
+            inputY = Texture.init(device: device, inDim: tensorY.dim)
+            let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel()))
+            inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision)
+        }
+        
+        //    required init(device: MTLDevice, param: ElementwiseAddParam<P>) {
+        //      param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
+        //      if computePrecision == .Float32 {
+        //        super.init(device: device, inFunctionName: "elementwise_add")
+        //      } else if computePrecision == .Float16 {
+        //        super.init(device: device, inFunctionName: "elementwise_add_half")
+        //      } else {
+        //        fatalError()
+        //      }
+        //    }
+        
+        var offset = axis
+        if axis == -1 {
+            offset = inputX.tensorDim.cout() - inputY.tensorDim.cout()
+        }
+        for i in 0..<(inputY.tensorDim.cout()) {
+            assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i])
+        }
     }
-    do {
-      inputY = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
-    } catch _ {
-      let tensorY: Tensor<P> = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
-      let device = inputX.metalTexture!.device
-      inputY = Texture.init(device: device, inDim: tensorY.dim)
-      let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel()))
-      inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision)
+    
+    let mode: String
+    let alpha: Tensor<P>
+    var inputX: Texture
+    var inputY: Texture
+    var output: Texture
+    var axis: Int
+}
+
+class ElementwiseAddPreluOp<P: PrecisionType>: Operator<ElementwiseAddPreluKernel<P>, ElementwiseAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
+    static func fusionNode() -> Node {
+        let beginNode = Node.init(inType: gElementwiseAddType)
+        _ = beginNode
+            --> Node.init(inType: gPreluType)
+        return beginNode
     }
     
-    //    required init(device: MTLDevice, param: ElementwiseAddParam<P>) {
-    //      param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
-    //      if computePrecision == .Float32 {
-    //        super.init(device: device, inFunctionName: "elementwise_add")
-    //      } else if computePrecision == .Float16 {
-    //        super.init(device: device, inFunctionName: "elementwise_add_half")
-    //      } else {
-    //        fatalError()
-    //      }
-    //    }
+    static func change() -> [String : [(from: String, to: String)]] {
+        return [:]
+    }
     
-    var offset = axis
-    if axis == -1 {
-      offset = inputX.tensorDim.cout() - inputY.tensorDim.cout()
+    static func fusionType() -> String {
+        return gElementwiseAddPreluType
     }
-    for i in 0..<(inputY.tensorDim.cout()) {
-      assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i])
+    
+    typealias OpType = ElementwiseAddPreluOp<P>
+    
+    func inferShape() {
+        //    para.output.dim = para.input.dim
     }
-  }
-  
-  let mode: String
-  let alpha: Tensor<P>
-  var inputX: Texture
-  var inputY: Texture
-  var output: Texture
-  var axis: Int
-}
-
-class ElementwiseAddPreluOp<P: PrecisionType>: Operator<ElementwiseAddPreluKernel<P>, ElementwiseAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
-  static func fusionNode() -> Node {
-    let beginNode = Node.init(inType: gElementwiseAddType)
-    _ = beginNode
-      --> Node.init(inType: gPreluType)
-    return beginNode
-  }
-  
-  static func change() -> [String : [(from: String, to: String)]] {
-    return [:]
-  }
-  
-  static func fusionType() -> String {
-    return gElementwiseAddPreluType
-  }
-  
-  typealias OpType = ElementwiseAddPreluOp<P>
-  
-  func inferShape() {
-    //    para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output)
     
-    let padToFourDim = para.output.padToFourDim
-    if para.output.transpose == [0, 1, 2, 3] {
-      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
-      print(outputArray.strideArray())
-    } else if para.output.transpose == [0, 2, 3, 1] {
-      print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
-    } else {
-      print(" not implement")
+    
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output)
+        
+        let padToFourDim = para.output.padToFourDim
+        if para.output.transpose == [0, 1, 2, 3] {
+            let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
+            print(outputArray.strideArray())
+        } else if para.output.transpose == [0, 2, 3, 1] {
+            print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+        } else {
+            print(" not implement")
+        }
     }
-  }
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift
index 46defcb583..bab3d8dce7 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift
@@ -17,54 +17,54 @@ import MetalKit
 import CoreMedia
 
 class FeedParam<P: PrecisionType>: OpParam{
-  var output: Texture
-  var input: InputTexture {
-    return scope.input() as! InputTexture
-  }
-  let scope: Scope
-  
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    scope = inScope
-    do {
-      output = try FeedParam.outputOut(outputs: opDesc.outputs, from: inScope)
-    } catch let error {
-      throw error
+    var output: Texture
+    var input: InputTexture {
+        return scope.input() as! InputTexture
     }
-  }
-  
-  //typealias ParamPrecisionType = P
+    let scope: Scope
+    
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        scope = inScope
+        do {
+            output = try FeedParam.outputOut(outputs: opDesc.outputs, from: inScope)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    //typealias ParamPrecisionType = P
 }
 
 class FeedOp<P: PrecisionType>: Operator<Texture2DTo2DArrayKernel<P>, FeedParam<P>>, Runable, Creator, InferShaperable {
-  typealias OpType = FeedOp<P>
-
-  func inferShape() {
-    //        print("feed  input: \(para.input.expectDim)")
-    print("feed output: \(para.output.dim)")
-    //        para.output.dim =
-    //        para.output.dim = para.input.expectDim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    typealias OpType = FeedOp<P>
+    
+    func inferShape() {
+        //        print("feed  input: \(para.input.expectDim)")
+        print("feed output: \(para.output.dim)")
+        //        para.output.dim =
+        //        para.output.dim = para.input.expectDim
+    }
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+        
+        //        let resizeKernel = ResizeKernel<P>.init(device: device)
+        //        let resizeParam = ResizeParam.init(input: para.input.mtlTexture, output: para.output.metalTexture, expectDim: para.input.expectDim)
+        //        do {
+        //            try resizeKernel.compute(commandBuffer: buffer, param: resizeParam)
+        //        } catch let error {
+        //            throw error
+        //        }
     }
     
-    //        let resizeKernel = ResizeKernel<P>.init(device: device)
-    //        let resizeParam = ResizeParam.init(input: para.input.mtlTexture, output: para.output.metalTexture, expectDim: para.input.expectDim)
-    //        do {
-    //            try resizeKernel.compute(commandBuffer: buffer, param: resizeParam)
-    //        } catch let error {
-    //            throw error
-    //        }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output.metalTexture)
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[3], h: para.output.padToFourDim[2], w: para.output.padToFourDim[1])).strideArray())
-  }
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output.metalTexture)
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[3], h: para.output.padToFourDim[2], w: para.output.padToFourDim[1])).strideArray())
+    }
 }
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift
index a5d04a4b03..671c2f33fa 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift
@@ -16,43 +16,43 @@ import Foundation
 import Metal
 
 class FetchParam<P: PrecisionType>: OpParam{
-  var output: FetchHolder
-  let input: Texture
-  let scope: Scope
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    scope = inScope
-    do {
-      input = try FetchParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = FetchHolder.init(inPaddedCapacity: input.elementCount(), inDim: input.tensorDim)
-      scope.setOutput(output: output)
-    } catch let error {
-      throw error
+    var output: FetchHolder
+    let input: Texture
+    let scope: Scope
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        scope = inScope
+        do {
+            input = try FetchParam.inputX(inputs: opDesc.inputs, from: inScope)
+            output = FetchHolder.init(inPaddedCapacity: input.elementCount(), inDim: input.tensorDim)
+            scope.setOutput(output: output)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  //typealias ParamPrecisionType = P
+    
+    //typealias ParamPrecisionType = P
 }
 
 class FetchOp<P: PrecisionType>: Operator< FetchKernel<P>, FetchParam<P>>, Runable, Creator, InferShaperable {
-  
-  typealias OpType = FetchOp<P>
-
-  func inferShape() {
-    print(para.input.dim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = FetchOp<P>
+    
+    func inferShape() {
+        print(para.input.dim)
+    }
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        print("fetch output: ")
+        let resArr = self.para.output.result.floatArr(count: self.para.output.capacity)
+        print(resArr.strideArray())
     }
-  }
-  
-  func delogOutput() {
-    print("fetch output: ")
-    let resArr = self.para.output.result.floatArr(count: self.para.output.capacity)
-    print(resArr.strideArray())
-  }
 }
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/FlattenOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/FlattenOp.swift
index 8500798adc..b982990851 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/FlattenOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/FlattenOp.swift
@@ -15,45 +15,45 @@
 import Foundation
 
 class FlattenParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try FlattenParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try FlattenParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      axis = try FlattenParam.getAttr(key: "axis", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            input = try FlattenParam.inputX(inputs: opDesc.inputs, from: inScope)
+            output = try FlattenParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            axis = try FlattenParam.getAttr(key: "axis", attrs: opDesc.attrs)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  let input: Texture
-  var output: Texture
-  let axis: Int
+    let input: Texture
+    var output: Texture
+    let axis: Int
 }
 
 
 class FlattenOp<P: PrecisionType>: Operator<FlattenKernel<P>, FlattenParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = FlattenOp<P>
-
-  func inferShape() {
-    //        para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = FlattenOp<P>
+    
+    func inferShape() {
+        //        para.output.dim = para.input.dim
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    let device = para.output.metalTexture!.device
-    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-    print(outputArray.strideArray())
-  }
-  
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        let device = para.output.metalTexture!.device
+        let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+        print(outputArray.strideArray())
+    }
+    
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift
index a7aaa9eddc..43ce7927eb 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift
@@ -19,125 +19,125 @@ public protocol TestParam {
 }
 
 public protocol Testable {
-  associatedtype TestParamType: TestParam
-  func test(commandBuffer: MTLCommandBuffer, param: TestParamType)
-  init(device: MTLDevice, testParam: TestParamType, initContext: InitContext)
+    associatedtype TestParamType: TestParam
+    func test(commandBuffer: MTLCommandBuffer, param: TestParamType)
+    init(device: MTLDevice, testParam: TestParamType, initContext: InitContext)
 }
 
 
 protocol Computable {
-  associatedtype ParamType: OpParam
-  func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws
-  init(device: MTLDevice, param: ParamType, initContext: InitContext)
+    associatedtype ParamType: OpParam
+    func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws
+    init(device: MTLDevice, param: ParamType, initContext: InitContext)
 }
 
 protocol KernelProtocol {
-  var pipline: MTLComputePipelineState { get set }
-  var functionName: String { get set }
-  
+    var pipline: MTLComputePipelineState { get set }
+    var functionName: String { get set }
+    
 }
 
 @objc open class Kernel: NSObject{
-  let pipline: MTLComputePipelineState
-  let functionName: String
-  public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = false, initContext: InitContext) {
-    pipline = device.pipeLine(funcName: inFunctionName, metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath)
-    functionName = inFunctionName
-  }
+    let pipline: MTLComputePipelineState
+    let functionName: String
+    public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = false, initContext: InitContext) {
+        pipline = device.pipeLine(funcName: inFunctionName, metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath)
+        functionName = inFunctionName
+    }
 }
 
 @objc public class Shape: NSObject {
-  public let width: Int
-  public let height: Int
-  public let channel: Int
-  @objc public init(inWidth: Int, inHeight: Int, inChannel: Int){
-    width = inWidth
-    height = inHeight
-    channel = inChannel
-  }
+    public let width: Int
+    public let height: Int
+    public let channel: Int
+    @objc public init(inWidth: Int, inHeight: Int, inChannel: Int){
+        width = inWidth
+        height = inHeight
+        channel = inChannel
+    }
 }
 
 open class BufferToTextureKernel: Kernel {
-  public let outputTexture: MTLTexture
-  
-  public init(device: MTLDevice, outputDim: Shape, metalLoadMode: MetalLoadMode, metalLibPath: String?) {
-    let textureDesc = MTLTextureDescriptor.init()
-    textureDesc.textureType = .type2D
-    textureDesc.width = outputDim.width
-    textureDesc.height = outputDim.height
-    textureDesc.depth = (outputDim.channel + 3) / 4
+    public let outputTexture: MTLTexture
     
-    if GlobalConfig.shared.computePrecision == .Float16 {
-      textureDesc.pixelFormat = .rgba16Float
-    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      textureDesc.pixelFormat = .rgba32Float
-    } else {
-      fatalError()
+    public init(device: MTLDevice, outputDim: Shape, metalLoadMode: MetalLoadMode, metalLibPath: String?) {
+        let textureDesc = MTLTextureDescriptor.init()
+        textureDesc.textureType = .type2D
+        textureDesc.width = outputDim.width
+        textureDesc.height = outputDim.height
+        textureDesc.depth = (outputDim.channel + 3) / 4
+        
+        if GlobalConfig.shared.computePrecision == .Float16 {
+            textureDesc.pixelFormat = .rgba16Float
+        } else if GlobalConfig.shared.computePrecision == .Float32 {
+            textureDesc.pixelFormat = .rgba32Float
+        } else {
+            fatalError()
+        }
+        
+        textureDesc.usage = [.shaderRead, .shaderWrite]
+        textureDesc.storageMode = .shared
+        outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error "
+        let initContext = InitContext.init()
+        initContext.metalLibPath = metalLibPath
+        initContext.metalLoadMode = metalLoadMode
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "buffer_to_texture_kernel", initContext: initContext)
+        } else {
+            super.init(device: device, inFunctionName: "buffer_to_texture_kernel_half", initContext: initContext)
+        }
     }
     
-    textureDesc.usage = [.shaderRead, .shaderWrite]
-    textureDesc.storageMode = .shared
-    outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error "
-    let initContext = InitContext.init()
-    initContext.metalLibPath = metalLibPath
-    initContext.metalLoadMode = metalLoadMode
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "buffer_to_texture_kernel", initContext: initContext)
-    } else {
-      super.init(device: device, inFunctionName: "buffer_to_texture_kernel_half", initContext: initContext)
-    }
-  }
-  
-  public func compute(inputBuffer: MTLBuffer , commandBuffer: MTLCommandBuffer) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    public func compute(inputBuffer: MTLBuffer , commandBuffer: MTLCommandBuffer) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        
+        encoder.setBuffer(inputBuffer, offset: 0, index: 0)
+        encoder.setTexture(outputTexture, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: outputTexture)
+        encoder.endEncoding()
     }
     
-    encoder.setBuffer(inputBuffer, offset: 0, index: 0)
-    encoder.setTexture(outputTexture, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: outputTexture)
-    encoder.endEncoding()
-  }
-
 }
 
 @objc open class CusomKernel: Kernel {
-
-  public let outputTexture: MTLTexture
-  public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, metalLoadModel: MetalLoadMode, metalLibPath: String?) {
-    let textureDesc = MTLTextureDescriptor.init()
-    textureDesc.textureType = .type2D
-    textureDesc.width = outputDim.width
-    textureDesc.height = outputDim.height
-    textureDesc.depth = (outputDim.channel + 3) / 4
     
-    if GlobalConfig.shared.computePrecision == .Float16 {
-      textureDesc.pixelFormat = .rgba16Float
-    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      textureDesc.pixelFormat = .rgba32Float
-    } else {
-      fatalError()
+    public let outputTexture: MTLTexture
+    public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, metalLoadModel: MetalLoadMode, metalLibPath: String?) {
+        let textureDesc = MTLTextureDescriptor.init()
+        textureDesc.textureType = .type2D
+        textureDesc.width = outputDim.width
+        textureDesc.height = outputDim.height
+        textureDesc.depth = (outputDim.channel + 3) / 4
+        
+        if GlobalConfig.shared.computePrecision == .Float16 {
+            textureDesc.pixelFormat = .rgba16Float
+        } else if GlobalConfig.shared.computePrecision == .Float32 {
+            textureDesc.pixelFormat = .rgba32Float
+        } else {
+            fatalError()
+        }
+        
+        textureDesc.usage = [.shaderRead, .shaderWrite]
+        textureDesc.storageMode = .shared
+        outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error "
+        
+        let context = InitContext.init()
+        context.metalLoadMode = metalLoadModel
+        context.metalLibPath = metalLibPath
+        super.init(device: device, inFunctionName: inFunctionName, initContext: context)
     }
     
-    textureDesc.usage = [.shaderRead, .shaderWrite]
-    textureDesc.storageMode = .shared
-    outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error "
-    
-    let context = InitContext.init()
-    context.metalLoadMode = metalLoadModel
-    context.metalLibPath = metalLibPath
-    super.init(device: device, inFunctionName: inFunctionName, initContext: context)
-  }
-  
-  public func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    public func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        encoder.setTexture(inputTexuture, index: 0)
+        encoder.setTexture(outputTexture, index: 1)
+        encoder.dispatch(computePipline: pipline, outTexture: outputTexture)
+        encoder.endEncoding()
     }
-    encoder.setTexture(inputTexuture, index: 0)
-    encoder.setTexture(outputTexture, index: 1)
-    encoder.dispatch(computePipline: pipline, outTexture: outputTexture)
-    encoder.endEncoding()
-  }
-  
+    
 }
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormKernel.swift
index 9eeb2aff9c..0e2005b024 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormKernel.swift
@@ -15,39 +15,39 @@
 import Foundation
 
 class BatchNormKernel<P: PrecisionType>: Kernel, Computable {
-  required init(device: MTLDevice, param: BatchNormParam<P>, initContext: InitContext) {
-    let count = param.variance.dim.numel()
-    let varianceP = param.variance.data.pointer
-    let meanP = param.mean.data.pointer
-    let scaleP = param.scale.data.pointer
-    let biasP = param.bias.data.pointer
-    for i in 0..<count {
-      let invStd = P(1 / (Float32(varianceP[i]) + param.epsilon).squareRoot())
-      biasP[i] = biasP[i] - meanP[i] * invStd * scaleP[i]
-      scaleP[i] = invStd * scaleP[i]
+    required init(device: MTLDevice, param: BatchNormParam<P>, initContext: InitContext) {
+        let count = param.variance.dim.numel()
+        let varianceP = param.variance.data.pointer
+        let meanP = param.mean.data.pointer
+        let scaleP = param.scale.data.pointer
+        let biasP = param.bias.data.pointer
+        for i in 0..<count {
+            let invStd = P(1 / (Float32(varianceP[i]) + param.epsilon).squareRoot())
+            biasP[i] = biasP[i] - meanP[i] * invStd * scaleP[i]
+            scaleP[i] = invStd * scaleP[i]
+        }
+        
+        param.bias.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        param.scale.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "batchnorm", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "batchnorm_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
     }
-
-    param.bias.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.scale.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "batchnorm", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "batchnorm_half", initContext: initContext)
-    } else {
-      fatalError()
-    }
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encoder is nil")
+    
+    func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encoder is nil")
+        }
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setBuffer(param.scale.buffer, offset: 0, index: 0)
+        encoder.setBuffer(param.bias.buffer, offset: 0, index: 1)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBuffer(param.scale.buffer, offset: 0, index: 0)
-    encoder.setBuffer(param.bias.buffer, offset: 0, index: 1)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BilinearInterpKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BilinearInterpKernel.swift
index 0db2e98651..c8a6519085 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BilinearInterpKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BilinearInterpKernel.swift
@@ -15,41 +15,41 @@
 import Foundation
 
 struct BilinearInterpMetalParam {
-  var ratio_h: Float32
-  var ratio_w: Float32
+    var ratio_h: Float32 // filled by BilinearInterpKernel.compute from dims[2] of input and output
+    var ratio_w: Float32 // filled by BilinearInterpKernel.compute from dims[3] of input and output
 }
 
 class BilinearInterpKernel<P: PrecisionType>: Kernel, Computable{
-  func compute(commandBuffer: MTLCommandBuffer, param: BilinearInterpParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    func compute(commandBuffer: MTLCommandBuffer, param: BilinearInterpParam<P>) throws { // Encodes one bilinear-interp dispatch into commandBuffer.
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil") // no compute encoder could be created; NOTE(review): BatchNormKernel spells this " encoder is nil"
+        }
+        
+        encoder.setTexture(param.input.metalTexture, index: 0) // texture 0: input
+        encoder.setTexture(param.output.metalTexture, index: 1) // texture 1: output
+        var ratio_h: Float32 = 0 // stays 0 when output dims[2] <= 1, so (out - 1) is never a zero divisor
+        var ratio_w: Float32 = 0 // stays 0 when output dims[3] <= 1
+        if param.output.tensorDim.dims[2] > 1 {
+            ratio_h = Float32(param.input.tensorDim.dims[2]-1) / Float32(param.output.tensorDim.dims[2]-1) // (in - 1) / (out - 1)
+        }
+        if param.output.tensorDim.dims[3] > 1 {
+            ratio_w = Float32(param.input.tensorDim.dims[3]-1) / Float32(param.output.tensorDim.dims[3]-1) // (in - 1) / (out - 1)
+        }
+        var p = BilinearInterpMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w) // var because setBytes takes it by address
+        encoder.setBytes(&p, length: MemoryLayout<BilinearInterpMetalParam>.size, index: 0) // buffer 0: the two ratios, copied inline
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) // project helper; presumably sizes the grid from outTexture — confirm
+        encoder.endEncoding()
     }
     
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    var ratio_h: Float32 = 0
-    var ratio_w: Float32 = 0
-    if param.output.tensorDim.dims[2] > 1 {
-      ratio_h = Float32(param.input.tensorDim.dims[2]-1) / Float32(param.output.tensorDim.dims[2]-1)
+    required init(device: MTLDevice, param: BilinearInterpParam<P>, initContext: InitContext) { // Sets up the output texture and picks the shader function by compute precision.
+        param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) // output reuses the input's transpose
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "bilinear_interp_float", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "bilinear_interp_half", initContext: initContext)
+        } else {
+            fatalError() // any precision other than Float32/Float16 is not handled
+        }
     }
-    if param.output.tensorDim.dims[3] > 1 {
-      ratio_w = Float32(param.input.tensorDim.dims[3]-1) / Float32(param.output.tensorDim.dims[3]-1)
-    }
-    var p = BilinearInterpMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w)
-    encoder.setBytes(&p, length: MemoryLayout<BilinearInterpMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-  
-  required init(device: MTLDevice, param: BilinearInterpParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "bilinear_interp_float", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "bilinear_interp_half", initContext: initContext)
-    } else {
-      fatalError()
-    }
-  }
-  
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BoxcoderKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BoxcoderKernel.swift
index 6e528a5965..8f295672c1 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BoxcoderKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BoxcoderKernel.swift
@@ -18,29 +18,29 @@ struct BoxcoderMetalParam {
 }
 
 class BoxcoderKernel<P: PrecisionType>: Kernel, Computable{
-  func compute(commandBuffer: MTLCommandBuffer, param: BoxcoderParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    func compute(commandBuffer: MTLCommandBuffer, param: BoxcoderParam<P>) throws { // Encodes one box-coder dispatch into commandBuffer.
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil") // no compute encoder could be created
+        }
+        encoder.setTexture(param.priorBox.metalTexture, index: 0) // texture 0: priorBox
+        encoder.setTexture(param.priorBoxVar.metalTexture, index: 1) // texture 1: priorBoxVar
+        encoder.setTexture(param.targetBox.metalTexture, index: 2) // texture 2: targetBox
+        encoder.setTexture(param.output.metalTexture, index: 3) // texture 3: output
+        var bmp = BoxcoderMetalParam.init() // default-initialised; no fields are set here
+        encoder.setBytes(&bmp, length: MemoryLayout<BoxcoderMetalParam>.size, index: 0) // buffer 0: the param struct, copied inline
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) // project helper; presumably sizes the grid from outTexture — confirm
+        encoder.endEncoding()
     }
-    encoder.setTexture(param.priorBox.metalTexture, index: 0)
-    encoder.setTexture(param.priorBoxVar.metalTexture, index: 1)
-    encoder.setTexture(param.targetBox.metalTexture, index: 2)
-    encoder.setTexture(param.output.metalTexture, index: 3)
-    var bmp = BoxcoderMetalParam.init()
-    encoder.setBytes(&bmp, length: MemoryLayout<BoxcoderMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-  
-  required init(device: MTLDevice, param: BoxcoderParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 3, 1, 2], computePrecision: GlobalConfig.shared.computePrecision)
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "boxcoder_float", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "boxcoder_half", initContext: initContext)
-    } else {
-      fatalError()
+    
+    required init(device: MTLDevice, param: BoxcoderParam<P>, initContext: InitContext) { // Sets up the output texture and picks the shader function by compute precision.
+        param.output.initTexture(device: device, inTranspose: [0, 3, 1, 2], computePrecision: GlobalConfig.shared.computePrecision) // output always uses the fixed [0, 3, 1, 2] transpose
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "boxcoder_float", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "boxcoder_half", initContext: initContext)
+        } else {
+            fatalError() // any precision other than Float32/Float16 is not handled
+        }
     }
-  }
-  
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConcatKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConcatKernel.swift
index edb0289688..195366c796 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConcatKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConcatKernel.swift
@@ -16,133 +16,133 @@ import Foundation
 import Metal
 
 struct ConcatTestParam: TestParam {
-  var input: [MTLTexture]
-  var output: MTLTexture
-  var dims: [[Int]]
-  var axis: Int
-  var odim: [Int]
+    var input: [MTLTexture] // presumably the textures to concatenate — confirm against the test caller
+    var output: MTLTexture // presumably the destination texture — confirm
+    var dims: [[Int]] // presumably one dims array per input — confirm
+    var axis: Int // presumably the concat axis — confirm
+    var odim: [Int] // presumably the output dims — confirm
 }
 
 struct ConcatMetalParam {
-  var odim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1)
-  var axis: Int32 = 0
-  var offset: Int32 = 0
-  var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
-  var vdim: (Int32, Int32, Int32, Int32, Int32, Int32) = (0, 0, 0, 0, 0, 0)
+    var odim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1) // ConcatKernel.init copies param.output.dim[0...3] here
+    var axis: Int32 = 0 // ConcatKernel.init stores the axis after mapping it through param.transpose
+    var offset: Int32 = 0 // not assigned by ConcatKernel.init; keeps this default there
+    var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) // ConcatKernel.init copies param.output.transpose[0...3] here
+    var vdim: (Int32, Int32, Int32, Int32, Int32, Int32) = (0, 0, 0, 0, 0, 0) // per-input extent along axis, six slots; unused slots stay 0
 }
 
 class ConcatKernel<P: PrecisionType>: Kernel, Computable{
-  var v = "normal"
-  var pm = ConcatMetalParam.init()
-  func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam<P>) throws {
-    
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
-    }
-    let num = param.input.count
-    for i in 0..<num {
-      encoder.setTexture(param.input[i].metalTexture, index: i)
-    }
-    encoder.setTexture(param.output.metalTexture, index: num)
-    if v == "normal" {
-      encoder.setTexture(param.output.metalTexture, index: num + 1)
-    }
-    encoder.setBytes(&pm, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-
-  required init(device: MTLDevice, param: ConcatParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    let orank = param.output.tensorDim.cout()
-    let num = param.input.count
-    assert(num <= 6)
-    var axis = 4 - param.output.tensorDim.cout() + param.axis
-    for i in 0..<4 {
-      if param.transpose[i] == axis {
-        axis = i
-        break
-      }
-    }
-    pm.axis = Int32(axis)
-    pm.odim = (Int32(param.output.dim[0]), Int32(param.output.dim[1]), Int32(param.output.dim[2]), Int32(param.output.dim[3]))
-    pm.trans = (Int32(param.output.transpose[0]), Int32(param.output.transpose[1]), Int32(param.output.transpose[2]), Int32(param.output.transpose[3]))
-    var vdim: [Int] = [0, 0, 0, 0, 0, 0]
-    for i in 0..<num {
-      vdim[i] = param.input[i].dim[axis]
-    }
-    if orank == 4 {
-      if axis == 1 {
-        v = "y"
-      } else if axis == 2 {
-        v = "x"
-      } else {
-        if (param.output.dim[0] == 1) && axis == 3 {
-          var vz = true
-          for i in 0..<num {
-            if vdim[i] % 4 != 0 {
-              vz = false
-              break
-            }
-          }
-          if vz {
-            v = "z"
-            for i in 0..<num {
-              vdim[i] = vdim[i] / 4
-            }
-          }
+    var v = "normal" // shader variant tag; init may change it to "x", "y" or "z" and splices it into the function name
+    var pm = ConcatMetalParam.init() // filled in by init, sent to the shader by compute
+    func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam<P>) throws { // Encodes one concat dispatch into commandBuffer.
+        
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil") // no compute encoder could be created
         }
-      }
-    } else if orank == 3 {
-      if axis == 2 {
-        v = "y"
-      } else if axis == 3 {
-        v = "x"
-      } else if axis == 1 {
-        var vz = true
+        let num = param.input.count // inputs take texture slots 0..<num
         for i in 0..<num {
-          if vdim[i] % 4 != 0 {
-            vz = false
-            break
-          }
+            encoder.setTexture(param.input[i].metalTexture, index: i) // texture i: i-th input
         }
-        if vz {
-          v = "z"
-          for i in 0..<num {
-            vdim[i] = vdim[i] / 4
-          }
+        encoder.setTexture(param.output.metalTexture, index: num) // texture num: output
+        if v == "normal" { // only the "normal" variant gets a second output binding
+            encoder.setTexture(param.output.metalTexture, index: num + 1) // NOTE(review): same texture bound again at num + 1; presumably a separate read/write slot in the shader — confirm
         }
-      }
-    } else {
-      if axis == 2 {
-        v = "y"
-      } else if axis == 3 {
-        var vx = true
+        encoder.setBytes(&pm, length: MemoryLayout<ConcatMetalParam>.size, index: 0) // buffer 0: pm as prepared in init
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) // project helper; presumably sizes the grid from outTexture — confirm
+        encoder.endEncoding()
+    }
+    
+    required init(device: MTLDevice, param: ConcatParam<P>, initContext: InitContext) { // Sets up the output texture, fills pm, and picks the concat shader variant.
+        param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+        let orank = param.output.tensorDim.cout() // used as the output rank below; presumably the number of dims — confirm
+        let num = param.input.count // number of inputs; becomes part of the shader name
+        assert(num <= 6) // vdim below has six slots; assert is a debug-build check only
+        var axis = 4 - param.output.tensorDim.cout() + param.axis // presumably the axis index once dims are right-aligned to 4 — confirm
+        for i in 0..<4 { // then replace it with the first position in param.transpose holding that value
+            if param.transpose[i] == axis {
+                axis = i
+                break
+            }
+        }
+        pm.axis = Int32(axis)
+        pm.odim = (Int32(param.output.dim[0]), Int32(param.output.dim[1]), Int32(param.output.dim[2]), Int32(param.output.dim[3]))
+        pm.trans = (Int32(param.output.transpose[0]), Int32(param.output.transpose[1]), Int32(param.output.transpose[2]), Int32(param.output.transpose[3]))
+        var vdim: [Int] = [0, 0, 0, 0, 0, 0] // per-input extent along axis; slots past num stay 0
         for i in 0..<num {
-          if vdim[i] % 4 != 0 {
-            vx = false
-            break
-          }
+            vdim[i] = param.input[i].dim[axis]
         }
-        if vx {
-          v = "x"
-          for i in 0..<num {
-            vdim[i] = vdim[i] / 4
-          }
+        if orank == 4 { // rank 4: axis 1 -> "y", axis 2 -> "x", axis 3 may become "z"
+            if axis == 1 {
+                v = "y"
+            } else if axis == 2 {
+                v = "x"
+            } else {
+                if (param.output.dim[0] == 1) && axis == 3 { // "z" needs output.dim[0] == 1 and every input extent divisible by 4
+                    var vz = true
+                    for i in 0..<num {
+                        if vdim[i] % 4 != 0 {
+                            vz = false
+                            break
+                        }
+                    }
+                    if vz {
+                        v = "z"
+                        for i in 0..<num {
+                            vdim[i] = vdim[i] / 4 // this variant receives extents in units of 4
+                        }
+                    }
+                }
+            }
+        } else if orank == 3 { // rank 3: axis 2 -> "y", axis 3 -> "x", axis 1 may become "z"
+            if axis == 2 {
+                v = "y"
+            } else if axis == 3 {
+                v = "x"
+            } else if axis == 1 {
+                var vz = true
+                for i in 0..<num {
+                    if vdim[i] % 4 != 0 {
+                        vz = false
+                        break
+                    }
+                }
+                if vz {
+                    v = "z"
+                    for i in 0..<num {
+                        vdim[i] = vdim[i] / 4 // this variant receives extents in units of 4
+                    }
+                }
+            }
+        } else { // every other rank: axis 2 -> "y", axis 3 may become "x"
+            if axis == 2 {
+                v = "y"
+            } else if axis == 3 {
+                var vx = true
+                for i in 0..<num {
+                    if vdim[i] % 4 != 0 {
+                        vx = false
+                        break
+                    }
+                }
+                if vx {
+                    v = "x"
+                    for i in 0..<num {
+                        vdim[i] = vdim[i] / 4 // this variant receives extents in units of 4
+                    }
+                }
+            }
+        }
+        pm.vdim = (Int32(vdim[0]), Int32(vdim[1]), Int32(vdim[2]), Int32(vdim[3]), Int32(vdim[4]), Int32(vdim[5]))
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_float", initContext: initContext) // name is concat_<rank>_<inputs>_<variant>_float
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_half", initContext: initContext) // name is concat_<rank>_<inputs>_<variant>_half
+        } else {
+            fatalError() // any precision other than Float32/Float16 is not handled
         }
-      }
     }
-    pm.vdim = (Int32(vdim[0]), Int32(vdim[1]), Int32(vdim[2]), Int32(vdim[3]), Int32(vdim[4]), Int32(vdim[5]))
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_float", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_half", initContext: initContext)
-    } else {
-      fatalError()
+    
+    required init(device: MTLDevice, testParam: ConcatTestParam, initContext: InitContext) { // Test initializer; testParam is not read in this body.
+        super.init(device: device, inFunctionName: "concat", initContext: initContext) // always the fixed "concat" function; v and pm keep their defaults
     }
-  }
-  
-  required init(device: MTLDevice, testParam: ConcatTestParam, initContext: InitContext) {
-    super.init(device: device, inFunctionName: "concat", initContext: initContext)
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
index 650f1b4497..b2dd306446 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
@@ -15,136 +15,136 @@
 import Foundation
 
 class ConvAddAddPreluKernel<P: PrecisionType>: Kernel, Computable {
-  var metalParam: MetalConvParam!
-  required init(device: MTLDevice, param: ConvAddAddPreluParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    
-    if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half", initContext: initContext)
-        }
+    var metalParam: MetalConvParam! // assigned on the last line of init; compute passes it to the shader via setBytes
+    required init(device: MTLDevice, param: ConvAddAddPreluParam<P>, initContext: InitContext) { // Sets up texture/buffers, picks the shader by precision, filter shape and prelu mode, then builds metalParam.
+        param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) // output always uses the fixed [0, 2, 3, 1] transpose
+        param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
         
-      } else if param.filter.channel == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half", initContext: initContext)
-        }
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half", initContext: initContext)
+        if GlobalConfig.shared.computePrecision == .Float16 { // half-precision shader variants
+            if param.filter.width == 1 && param.filter.height == 1 { // 1x1 is tested first, before the channel == 1 branch
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half", initContext: initContext)
+                } else { // any mode other than "channel"/"element" uses the "other" shader
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half", initContext: initContext)
+                }
+                
+            } else if param.filter.channel == 1 { // NOTE(review): channel == 1 selects the depthwise 3x3 shader without checking the filter is 3x3
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half", initContext: initContext)
+                }
+            } else if param.filter.width == 3 && param.filter.height == 3 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half", initContext: initContext)
+                }
+                
+            } else if param.filter.width == 1 && param.filter.height == 5 { // shader names read <height>x<width>: height 5, width 1 -> "5x1"
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half", initContext: initContext)
+                }
+            } else if param.filter.width == 5 && param.filter.height == 1 { // height 1, width 5 -> "1x5"
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half", initContext: initContext)
+                }
+            } else {
+                fatalError(" unsupport yet ") // no shader for any other filter shape
+            }
+        } else if GlobalConfig.shared.computePrecision == .Float32 { // float shader variants, same selection order as above
+            if param.filter.width == 1 && param.filter.height == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float", initContext: initContext)
+                }
+            } else if param.filter.channel == 1 { // NOTE(review): same unchecked depthwise 3x3 assumption as the half branch
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float", initContext: initContext)
+                }
+            } else if param.filter.width == 3 && param.filter.height == 3 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float", initContext: initContext)
+                }
+                
+            } else if param.filter.width == 1 && param.filter.height == 5 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float", initContext: initContext)
+                }
+            } else if param.filter.width == 5 && param.filter.height == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float", initContext: initContext)
+                }
+            } else {
+                fatalError(" unsupport yet ") // no shader for any other filter shape
+            }
         } else {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half", initContext: initContext)
+            fatalError() // any precision other than Float16/Float32 is not handled
         }
         
-      } else if param.filter.width == 1 && param.filter.height == 5 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half", initContext: initContext)
-        }
-      } else if param.filter.width == 5 && param.filter.height == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half", initContext: initContext)
-        }
-      } else {
-        fatalError(" unsupport yet ")
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float", initContext: initContext)
-        }
-      } else if param.filter.channel == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float", initContext: initContext)
-        }
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float", initContext: initContext)
-        }
+        let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) // (dilation * (height - 1) + 1) / 2 - padding, integer division
         
-      } else if param.filter.width == 1 && param.filter.height == 5 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float", initContext: initContext)
-        }
-      } else if param.filter.width == 5 && param.filter.height == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float", initContext: initContext)
-        }
-      } else {
-        fatalError(" unsupport yet ")
-      }
-    } else {
-      fatalError()
+        let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) // same formula with width, dilations[0] and paddings[0]
+        
+        //    print(" function: \(functionName)")
+        //    print("offset x: \(offsetX)")
+        //    print("offset y: \(offsetY)")
+        
+        let offsetZ = 0.0 // a Double literal; converted with Int16(offsetZ) on the next line
+        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+        //    print("metal param: ")
+        //    print(inMetalParam)
+        
+        metalParam = inMetalParam // stored for compute, which sends it via setBytes
     }
     
-    let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
-    
-    let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
-    
-    //    print(" function: \(functionName)")
-    //    print("offset x: \(offsetX)")
-    //    print("offset y: \(offsetY)")
-    
-    let offsetZ = 0.0
-    let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
-    //    print("metal param: ")
-    //    print(inMetalParam)
-    
-    metalParam = inMetalParam
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ConvAddAddPreluParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    func compute(commandBuffer: MTLCommandBuffer, param: ConvAddAddPreluParam<P>) throws { // Encodes one conv + add + prelu dispatch into commandBuffer.
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil") // no compute encoder could be created
+        }
+        
+        encoder.setTexture(param.input.metalTexture, index: 0) // texture 0: input
+        encoder.setTexture(param.output.metalTexture, index: 1) // texture 1: output
+        encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0) // buffer 0: the MetalConvParam built in init
+        encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) // buffer 1: filter
+        encoder.setBuffer(param.y.buffer, offset: 0, index: 2) // buffer 2: y
+        encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3) // buffer 3: alpha
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) // project helper; presumably sizes the grid from outTexture — confirm
+        encoder.endEncoding()
     }
-    
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
-    encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
-    encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
index 6274e3df8f..0ff0b57f6c 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
@@ -16,165 +16,165 @@ import Foundation
 import Metal
 
 struct ConvAddBatchNormReluTestParam: TestParam {
-  let inputTexture: MTLTexture
-  let outputTexture: MTLTexture
-  var metalParam: MetalConvParam
-  let filterBuffer: MTLBuffer
-  let biaseBuffer: MTLBuffer
-  let newScaleBuffer: MTLBuffer
-  let newBiaseBuffer: MTLBuffer
-  let filterSize: (width: Int, height: Int, channel: Int)
-  init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) {
-    inputTexture = inInputTexture
-    outputTexture = inOutputTexture
-    metalParam = inMetalParam
-    filterBuffer = inFilterBuffer
-    biaseBuffer = inBiaseBuffer
-    newScaleBuffer = inNewScaleBuffer
-    newBiaseBuffer = inNewBiaseBuffer
-    filterSize = inFilterSize
-  }
+    let inputTexture: MTLTexture    // bound as texture 0 by the kernel's `test`
+    let outputTexture: MTLTexture   // bound as texture 1 and passed to `dispatch` as `outTexture`
+    var metalParam: MetalConvParam  // copied, then uploaded with `setBytes` at buffer index 0
+    let filterBuffer: MTLBuffer     // buffer index 1
+    let biaseBuffer: MTLBuffer      // buffer index 2
+    let newScaleBuffer: MTLBuffer   // buffer index 3
+    let newBiaseBuffer: MTLBuffer   // buffer index 4
+    let filterSize: (width: Int, height: Int, channel: Int)  // read by the kernel's test init to pick the Metal function
+    init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) {
+        self.inputTexture = inInputTexture
+        self.outputTexture = inOutputTexture
+        self.metalParam = inMetalParam
+        self.filterBuffer = inFilterBuffer
+        self.biaseBuffer = inBiaseBuffer
+        self.newScaleBuffer = inNewScaleBuffer
+        self.newBiaseBuffer = inNewBiaseBuffer
+        self.filterSize = inFilterSize
+    }
 }
 
 class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
-  required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam, initContext: InitContext) {
-    if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 {
-      super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1", initContext: initContext)
-    } else if testParam.filterSize.channel == 1 {
-      super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3", initContext: initContext)
-    } else {
-      super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3", initContext: initContext)
+    required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam, initContext: InitContext) {
+        let kernelName: String
+        switch (testParam.filterSize.width, testParam.filterSize.height, testParam.filterSize.channel) {
+        case (1, 1, _): kernelName = "conv_add_batch_norm_relu_1x1"  // 1x1 filter wins first
+        case (_, _, 1): kernelName = "depthwise_conv_add_batch_norm_relu_3x3"  // single-channel filter
+        default: kernelName = "conv_add_batch_norm_relu_3x3"  // everything else
+        }
+        super.init(device: device, inFunctionName: kernelName, initContext: initContext)
     }
-  }
-  
-  var metalParam: MetalConvParam!
-  
-  required init(device: MTLDevice, param: ConvAddBatchNormReluParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.variance.initBuffer(device: device, precision: .Float32)
-    param.mean.initBuffer(device: device, precision: .Float32)
-    param.scale.initBuffer(device: device, precision: .Float32)
-    param.bias.initBuffer(device: device, precision: .Float32)
     
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1", initContext: initContext)
-      } else if param.filter.channel == 1 {
-        super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3", initContext: initContext)
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3", initContext: initContext)
-      } else {
-        fatalError(" unsupport ")
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1_half", initContext: initContext)
-      } else if param.filter.channel == 1 {
-        super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3_half", initContext: initContext)
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3_half", initContext: initContext)
-      } else {
-        fatalError(" unsupport ")
-      }
-    } else {
-      fatalError()
+    var metalParam: MetalConvParam!
+    
+    /// Sets up the output texture and the filter / y / batch-norm buffers, selects the Metal
+    /// function for the compute precision and filter shape, builds `metalParam`, and folds batch
+    /// norm into `param.newScale` / `param.newBiase`:
+    ///   inv = 1 / sqrt(variance + epsilon), newScale = inv * scale, newBiase = bias - mean * inv * scale.
+    required init(device: MTLDevice, param: ConvAddBatchNormReluParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
+        param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        param.variance.initBuffer(device: device, precision: .Float32)
+        param.mean.initBuffer(device: device, precision: .Float32)
+        param.scale.initBuffer(device: device, precision: .Float32)
+        param.bias.initBuffer(device: device, precision: .Float32)
+        
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            if param.filter.width == 1 && param.filter.height == 1 {
+                super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1", initContext: initContext)
+            } else if param.filter.channel == 1 {
+                super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3", initContext: initContext)
+            } else if param.filter.width == 3 && param.filter.height == 3 {
+                super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3", initContext: initContext)
+            } else {
+                fatalError(" unsupport ")
+            }
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            if param.filter.width == 1 && param.filter.height == 1 {
+                super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1_half", initContext: initContext)
+            } else if param.filter.channel == 1 {
+                super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3_half", initContext: initContext)
+            } else if param.filter.width == 3 && param.filter.height == 3 {
+                super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3_half", initContext: initContext)
+            } else {
+                fatalError(" unsupport ")
+            }
+        } else {
+            fatalError()
+        }
+        
+        // Sampling offset: half the filter extent minus the padding on that axis.
+        let offsetX = param.filter.width/2 - Int(param.paddings[0])
+        let offsetY = param.filter.height/2 - Int(param.paddings[1])
+        let offsetZ = 0.0
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+        
+        // Element counts: `buffer.length` is in bytes, while the pointers below are indexed per element.
+        let varianceCount = param.variance.buffer.length / MemoryLayout<P>.stride
+        let scaleCount = param.scale.buffer.length / MemoryLayout<P>.stride
+        let biasCount = param.bias.buffer.length / MemoryLayout<P>.stride
+        
+        // Inverse standard deviation for every variance entry.
+        var invs: [P] = []
+        let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
+        for i in 0..<varianceCount {
+            let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
+            invs.append(P(inv))
+        }
+        
+        // Scratch storage is sized in elements; passing the byte length here over-allocated it.
+        let newScale: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: scaleCount)
+        let newBiase: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: biasCount)
+        
+        let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
+        let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
+        let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
+        for i in 0..<scaleCount {
+            newScale[i] = invs[i] * scaleContents[i]
+            newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i]
+        }
+        
+        let newBiaseBuffer: MTLBuffer
+        let newScaleBuffer: MTLBuffer
+        
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)!
+            newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)!
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            // Half precision needs half the bytes. Each buffer is sized from its own source tensor
+            // (the scale buffer used to be sized from the bias length).
+            newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
+            newScaleBuffer = device.makeBuffer(length: param.scale.buffer.length / 2)!
+            
+            float32ToFloat16(input: newBiase as! UnsafeMutablePointer<Float32>, output: newBiaseBuffer.contents(), count: biasCount)
+            float32ToFloat16(input: newScale as! UnsafeMutablePointer<Float32>, output: newScaleBuffer.contents(), count: scaleCount)
+        } else {
+            fatalError(" unsupport ")
+        }
+        
+        param.newBiase = newBiaseBuffer
+        param.newScale = newScaleBuffer
+        
+        newScale.deinitialize(count: scaleCount)
+        newScale.deallocate()
+        
+        newBiase.deinitialize(count: biasCount)
+        newBiase.deallocate()
     }
     
-    let offsetX = param.filter.width/2 - Int(param.paddings[0])
-    let offsetY = param.filter.height/2 - Int(param.paddings[1])
-    
-    print("offset x: \(offsetX)")
-    print("offset y: \(offsetY)")
-    
-    let offsetZ = 0.0
-    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
-    
-    var invs: [P] = []
-    let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
-    
-    for i in 0..<param.variance.buffer.length/MemoryLayout<P>.stride {
-      let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
-      invs.append(P(inv))
+    /// Binds conv params, filter / y / folded batch-norm buffers and both textures, then dispatches `pipline`.
+    func compute(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluParam<P>) throws {
+        guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        defer { computeEncoder.endEncoding() }
+        computeEncoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+        computeEncoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+        computeEncoder.setBuffer(param.y.buffer, offset: 0, index: 2)
+        computeEncoder.setBuffer(param.newScale!, offset: 0, index: 3)
+        computeEncoder.setBuffer(param.newBiase!, offset: 0, index: 4)
+        computeEncoder.setTexture(param.input.metalTexture, index: 0)
+        computeEncoder.setTexture(param.output.metalTexture, index: 1)
+        computeEncoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
     }
     
-    let newScale: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.scale.buffer.length)
-    let newBiase: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.bias.buffer.length)
-    
-    let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
-    let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
-    let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
-    for i in 0..<param.scale.buffer.length/MemoryLayout<P>.stride {
-      newScale[i] = invs[i] * scaleContents[i]
-      newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i]
+    /// Test entry point: binds the raw textures and buffers carried by `param`, then dispatches `pipline`.
+    public func test(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluTestParam) {
+        guard let testEncoder = commandBuffer.makeComputeCommandEncoder() else {
+            fatalError()
+        }
+        defer { testEncoder.endEncoding() }
+        var convParam = param.metalParam  // `setBytes` takes `&`, so a mutable copy is required
+        testEncoder.setBytes(&convParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+        testEncoder.setBuffer(param.filterBuffer, offset: 0, index: 1)
+        testEncoder.setBuffer(param.biaseBuffer, offset: 0, index: 2)
+        testEncoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3)
+        testEncoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4)
+        testEncoder.setTexture(param.inputTexture, index: 0)
+        testEncoder.setTexture(param.outputTexture, index: 1)
+        testEncoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
     }
-    
-//    var newScaleFP16: UnsafeMutableRawPointer
-//
-//    float32ToFloat16(input: newScale as! UnsafeMutablePointer<Float32>, output: newScaleFP16, count: param.scale.buffer.length / MemoryLayout<P>.size)
-    
-    
-//    let newBiaseFloat16 = device.makeBuffer(length: <#T##Int#>, options: <#T##MTLResourceOptions#>)
-    
-    var newBiaseBuffer: MTLBuffer
-    var newScaleBuffer: MTLBuffer
-    
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)!
-      newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)!
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      
-      newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
-      newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
-      
-      float32ToFloat16(input: newBiase as! UnsafeMutablePointer<Float32>, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout<P>.size)
-      
-      float32ToFloat16(input: newScale as! UnsafeMutablePointer<Float32>, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout<P>.size)
-    } else {
-      fatalError(" unsupport ")
-    }
-    
-    param.newBiase = newBiaseBuffer
-    param.newScale = newScaleBuffer
-    
-    newScale.deinitialize(count: param.scale.buffer.length)
-    newScale.deallocate()
-    
-    newBiase.deinitialize(count: param.bias.buffer.length)
-    newBiase.deallocate()
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
-    }
-    
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
-    encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
-    encoder.setBuffer(param.newScale!, offset: 0, index: 3)
-    encoder.setBuffer(param.newBiase!, offset: 0, index: 4)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-  
-  public func test(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluTestParam) {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      fatalError()
-    }
-    
-    encoder.setTexture(param.inputTexture, index: 0)
-    encoder.setTexture(param.outputTexture, index: 1)
-    var inMetalParam = param.metalParam
-    encoder.setBytes(&inMetalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-    encoder.setBuffer(param.filterBuffer, offset: 0, index: 1)
-    encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2)
-    encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3)
-    encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4)
-    encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
index 0ba448161f..d40fa7e445 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
@@ -15,74 +15,74 @@
 import Foundation
 
 class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
-  var metalParam: MetalConvParam!
-  required init(device: MTLDevice, param: ConvAddParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-    let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1])
-    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC)
-    param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    
-    if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_add_1x1_half", initContext: initContext)
-      } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
-        super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half", initContext: initContext)
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        super.init(device: device, inFunctionName: "conv_add_3x3_half", initContext: initContext)
-      } else if param.filter.width == 1 && param.filter.height == 5 {
-        super.init(device: device, inFunctionName: "conv_add_5x1_half", initContext: initContext)
-      } else if param.filter.width == 5 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_add_1x5_half", initContext: initContext)
-      } else {
-        fatalError(" unsupport yet ")
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_add_1x1", initContext: initContext)
-      } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
-        super.init(device: device, inFunctionName: "depthwise_conv_add_3x3", initContext: initContext)
-      } else if param.filter.width == 1 && param.filter.height == 5 {
-        super.init(device: device, inFunctionName: "conv_add_5x1", initContext: initContext)
-      } else if param.filter.width == 5 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_add_1x5", initContext: initContext)
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        super.init(device: device, inFunctionName: "conv_add_3x3", initContext: initContext)
-      } else {
-        fatalError(" unsupport yet ")
-      }
-    } else {
-      fatalError()
+    var metalParam: MetalConvParam!
+    /// Initialises the output texture and the filter / y buffers, picks the Metal function for the
+    /// current compute precision and filter shape, and derives `metalParam` from the padding,
+    /// stride and dilation settings. Traps via `fatalError` on an unsupported combination.
+    required init(device: MTLDevice, param: ConvAddParam<P>, initContext: InitContext) {
+        let precision = GlobalConfig.shared.computePrecision
+        param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: precision)
+        let padFilter = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1])
+        param.filter.initBuffer(device: device, precision: precision, padWhenOneC: padFilter)
+        param.y.initBuffer(device: device, precision: precision)
+        
+        // Shape is matched after the buffers are initialised; the depthwise test only runs if 1x1 did not match.
+        let kernelName: String
+        if precision == .Float16 {
+            switch (param.filter.width, param.filter.height) {
+            case (1, 1):
+                kernelName = "conv_add_1x1_half"
+            case _ where param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1]:
+                kernelName = "depthwise_conv_add_3x3_half"
+            case (3, 3):
+                kernelName = "conv_add_3x3_half"
+            case (1, 5):
+                kernelName = "conv_add_5x1_half"
+            case (5, 1):
+                kernelName = "conv_add_1x5_half"
+            default:
+                fatalError(" unsupport yet ")
+            }
+        } else if precision == .Float32 {
+            switch (param.filter.width, param.filter.height) {
+            case (1, 1):
+                kernelName = "conv_add_1x1"
+            case _ where param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1]:
+                kernelName = "depthwise_conv_add_3x3"
+            case (1, 5):
+                kernelName = "conv_add_5x1"
+            case (5, 1):
+                kernelName = "conv_add_1x5"
+            case (3, 3):
+                kernelName = "conv_add_3x3"
+            default:
+                fatalError(" unsupport yet ")
+            }
+        } else {
+            fatalError()
+        }
+        super.init(device: device, inFunctionName: kernelName, initContext: initContext)
+        
+        // Offsets use the dilated filter extent: (dilation * (size - 1) + 1) / 2 - padding.
+        let dilatedHeight = Int(param.dilations[1]) * (param.filter.height - 1) + 1
+        let dilatedWidth = Int(param.dilations[0]) * (param.filter.width - 1) + 1
+        let offsetY = dilatedHeight/2 - Int(param.paddings[1])
+        let offsetX = dilatedWidth/2 - Int(param.paddings[0])
+        let offsetZ = 0.0
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
     }
     
-
-    
-    let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
-    
-    let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
-    
-//    print(" function: \(functionName)")
-//    print("offset x: \(offsetX)")
-//    print("offset y: \(offsetY)")
-    
-    let offsetZ = 0.0
-    let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
-//    print("metal param: ")
-//    print(inMetalParam)
-    
-    metalParam = inMetalParam
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    /// Binds conv params, the filter / y buffers and input/output textures, then dispatches `pipline`.
+    func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam<P>) throws {
+        guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        defer { computeEncoder.endEncoding() }
+        computeEncoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+        computeEncoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+        computeEncoder.setBuffer(param.y.buffer, offset: 0, index: 2)
+        computeEncoder.setTexture(param.input.metalTexture, index: 0)
+        computeEncoder.setTexture(param.output.metalTexture, index: 1)
+        computeEncoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
     }
-    
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
-    encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
index 1d66696050..1b054cb9ca 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
@@ -15,136 +15,136 @@
 import Foundation
 
 class ConvAddPreluKernel<P: PrecisionType>: Kernel, Computable {
-  var metalParam: MetalConvParam!
-  required init(device: MTLDevice, param: ConvAddPreluParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-
-    if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half", initContext: initContext)
-        }
+    var metalParam: MetalConvParam!
+    /// Sets up the conv + add + prelu kernel: initializes the output texture and the filter/y/alpha buffers,
+    /// picks the Metal function from compute precision, filter shape and `param.mode`, and fills `metalParam`.
+    /// Traps via `fatalError` when the precision or the filter shape is not handled below.
+    required init(device: MTLDevice, param: ConvAddPreluParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
+        param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
         
-      } else if param.filter.channel == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half", initContext: initContext)
-        }
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half", initContext: initContext)
+        if GlobalConfig.shared.computePrecision == .Float16 {
+            if param.filter.width == 1 && param.filter.height == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half", initContext: initContext)
+                }
+            } else if param.filter.channel == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half", initContext: initContext)
+                }
+            } else if param.filter.width == 3 && param.filter.height == 3 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half", initContext: initContext)
+                }
+            } else if param.filter.width == 1 && param.filter.height == 5 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half", initContext: initContext)
+                }
+            } else if param.filter.width == 5 && param.filter.height == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half", initContext: initContext)
+                }
+            } else {
+                fatalError(" unsupport yet ")
+            }
+        } else if GlobalConfig.shared.computePrecision == .Float32 {
+            if param.filter.width == 1 && param.filter.height == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float", initContext: initContext)
+                }
+            } else if param.filter.channel == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float", initContext: initContext)
+                }
+            } else if param.filter.width == 3 && param.filter.height == 3 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float", initContext: initContext)
+                }
+            } else if param.filter.width == 1 && param.filter.height == 5 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float", initContext: initContext)
+                }
+            } else if param.filter.width == 5 && param.filter.height == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float", initContext: initContext)
+                }
+            } else {
+                fatalError(" unsupport yet ")
+            }
         } else {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half", initContext: initContext)
+            fatalError()
         }
         
-      } else if param.filter.width == 1 && param.filter.height == 5 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half", initContext: initContext)
-        }
-      } else if param.filter.width == 5 && param.filter.height == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half", initContext: initContext)
-        }
-      } else {
-        fatalError(" unsupport yet ")
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float", initContext: initContext)
-        }
-      } else if param.filter.channel == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float", initContext: initContext)
-        }
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float", initContext: initContext)
-        }
+        // Offset on each axis: half of the dilated filter extent (integer division) minus the padding.
+        let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
         
-      } else if param.filter.width == 1 && param.filter.height == 5 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float", initContext: initContext)
-        }
-      } else if param.filter.width == 5 && param.filter.height == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float", initContext: initContext)
-        }
-      } else {
-        fatalError(" unsupport yet ")
-      }
-    } else {
-      fatalError()
+        let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
+        //    print(" function: \(functionName)")
+        //    print("offset x: \(offsetX)")
+        //    print("offset y: \(offsetY)")
+        
+        let offsetZ = 0.0
+        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+        //    print("metal param: ")
+        //    print(inMetalParam)
+        
+        metalParam = inMetalParam
     }
     
-    let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
-    
-    let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
-    
-    //    print(" function: \(functionName)")
-    //    print("offset x: \(offsetX)")
-    //    print("offset y: \(offsetY)")
-    
-    let offsetZ = 0.0
-    let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
-    //    print("metal param: ")
-    //    print(inMetalParam)
-    
-    metalParam = inMetalParam
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ConvAddPreluParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    /// Encodes one pass: binds the input/output textures, `metalParam`, and the filter, `y` and `alpha` buffers, then dispatches `pipline`.
+    func compute(commandBuffer: MTLCommandBuffer, param: ConvAddPreluParam<P>) throws {
+        guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        computeEncoder.setTexture(param.input.metalTexture, index: 0)
+        computeEncoder.setTexture(param.output.metalTexture, index: 1)
+        computeEncoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+        computeEncoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+        computeEncoder.setBuffer(param.y.buffer, offset: 0, index: 2)
+        computeEncoder.setBuffer(param.alpha.buffer, offset: 0, index: 3)
+        computeEncoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        computeEncoder.endEncoding()
     }
-    
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
-    encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
-    encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
index 81c53a57a8..415ec94b51 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
@@ -16,165 +16,165 @@ import Foundation
 import MetalPerformanceShaders
 
 struct ConvBNReluTestParam: TestParam {
-  let inputTexture: MTLTexture
-  let outputTexture: MTLTexture
-  var metalParam: MetalConvParam
-  let filterBuffer: MTLBuffer
-  let biaseBuffer: MTLBuffer
-  let newScaleBuffer: MTLBuffer
-  let newBiaseBuffer: MTLBuffer
-  let filterSize: (width: Int, height: Int, channel: Int)
-  init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) {
-    
-    inputTexture = inInputTexture
-    outputTexture = inOutputTexture
-    metalParam = inMetalParam
-    filterBuffer = inFilterBuffer
-    biaseBuffer = inBiaseBuffer
-    newScaleBuffer = inNewScaleBuffer
-    newBiaseBuffer = inNewBiaseBuffer
-    filterSize = inFilterSize
-  }
+    let inputTexture: MTLTexture
+    let outputTexture: MTLTexture
+    var metalParam: MetalConvParam
+    let filterBuffer: MTLBuffer
+    let biaseBuffer: MTLBuffer
+    let newScaleBuffer: MTLBuffer
+    let newBiaseBuffer: MTLBuffer
+    let filterSize: (width: Int, height: Int, channel: Int)
+    init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) {
+        
+        inputTexture = inInputTexture
+        outputTexture = inOutputTexture
+        metalParam = inMetalParam
+        filterBuffer = inFilterBuffer
+        biaseBuffer = inBiaseBuffer
+        newScaleBuffer = inNewScaleBuffer
+        newBiaseBuffer = inNewBiaseBuffer
+        filterSize = inFilterSize
+    }
 }
 
 class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
-  required init(device: MTLDevice, testParam: ConvBNReluTestParam, initContext: InitContext) {
-    if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 {
-      super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1", initContext: initContext)
-    } else if testParam.filterSize.channel == 1 {
-      super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3", initContext: initContext)
-    } else {
-      super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3", initContext: initContext)
+    /// Test-only initializer.
+    /// The Metal function is chosen from `testParam.filterSize`: 1x1, single-channel, or the 3x3 default.
+    required init(device: MTLDevice, testParam: ConvBNReluTestParam, initContext: InitContext) {
+        let size = testParam.filterSize
+        let isOneByOne = size.width == 1 && size.height == 1
+        let fallbackName = size.channel == 1 ? "depthwise_conv_batch_norm_relu_3x3" : "conv_batch_norm_relu_3x3"
+        let functionName = isOneByOne ? "conv_batch_norm_relu_1x1" : fallbackName
+        super.init(device: device, inFunctionName: functionName, initContext: initContext)
     }
-  }
-  
-  var metalParam: MetalConvParam!
-
-  required init(device: MTLDevice, param: ConvBNReluParam<P>, initContext: InitContext) {
     
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.variance.initBuffer(device: device, precision: .Float32)
-    param.mean.initBuffer(device: device, precision: .Float32)
-    param.scale.initBuffer(device: device, precision: .Float32)
-    param.bias.initBuffer(device: device, precision: .Float32)
-    
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1", initContext: initContext)
-      } else if param.filter.channel == 1 {
-        super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3", initContext: initContext)
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3", initContext: initContext)
-      } else {
-        fatalError(" unsupport ")
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half", initContext: initContext)
-      } else if param.filter.channel == 1 {
-        super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half", initContext: initContext)
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half", initContext: initContext)
-      } else {
-        fatalError(" unsupport ")
-      }
-    } else {
-      fatalError()
+    var metalParam: MetalConvParam!
+    
+    /// Builds the conv + batch-norm + relu kernel for `param`.
+    ///
+    /// Prepares the output texture and the parameter buffers, picks the Metal function from the
+    /// compute precision and filter shape, fills `metalParam`, and stores the folded batch-norm
+    /// scale and bias in `param.newScale` / `param.newBiase`. Traps via `fatalError` on an
+    /// unsupported precision or filter shape.
+    required init(device: MTLDevice, param: ConvBNReluParam<P>, initContext: InitContext) {
+        let precision = GlobalConfig.shared.computePrecision
+        param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: precision)
+        param.filter.initBuffer(device: device, precision: precision)
+        param.variance.initBuffer(device: device, precision: .Float32)
+        param.mean.initBuffer(device: device, precision: .Float32)
+        param.scale.initBuffer(device: device, precision: .Float32)
+        param.bias.initBuffer(device: device, precision: .Float32)
+
+        if precision == .Float32 {
+            if param.filter.width == 1 && param.filter.height == 1 {
+                super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1", initContext: initContext)
+            } else if param.filter.channel == 1 {
+                super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3", initContext: initContext)
+            } else if param.filter.width == 3 && param.filter.height == 3 {
+                super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3", initContext: initContext)
+            } else {
+                fatalError(" unsupport ")
+            }
+        } else if precision == .Float16 {
+            if param.filter.width == 1 && param.filter.height == 1 {
+                super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half", initContext: initContext)
+            } else if param.filter.channel == 1 {
+                super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half", initContext: initContext)
+            } else if param.filter.width == 3 && param.filter.height == 3 {
+                super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half", initContext: initContext)
+            } else {
+                fatalError(" unsupport ")
+            }
+        } else {
+            fatalError()
+        }
+
+        // Offsets stored in `metalParam`: half the filter extent minus the padding on each axis.
+        // NOTE(review): dilation is not part of these offsets, although it is passed along below - confirm.
+        let offsetX = param.filter.width/2 - Int(param.paddings[0])
+        let offsetY = param.filter.height/2 - Int(param.paddings[1])
+        let offsetZ = 0.0
+
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+
+        // invs[i] = 1 / sqrt(variance[i] + epsilon)
+        var invs: [P] = []
+        let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
+        for i in 0..<param.variance.buffer.length/MemoryLayout<P>.stride {
+            let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
+            invs.append(P(inv))
+        }
+
+        // NOTE(review): capacity is given in bytes rather than elements, so these scratch arrays are
+        // larger than needed; left unchanged because the extra room is harmless.
+        let newScale: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.scale.buffer.length)
+        let newBiase: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.bias.buffer.length)
+
+        // Fold batch norm: newScale = inv * scale, newBiase = bias - mean * inv * scale.
+        let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
+        let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
+        let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
+        for i in 0..<param.scale.buffer.length/MemoryLayout<P>.stride {
+            newScale[i] = invs[i] * scaleContents[i]
+            newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i]
+        }
+
+        let newBiaseBuffer: MTLBuffer
+        let newScaleBuffer: MTLBuffer
+        if precision == .Float32 {
+            newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)!
+            newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)!
+        } else if precision == .Float16 {
+            // The source buffers were initialized as Float32 above, so the Float16 copies take half the bytes.
+            newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
+            // Sized from the scale buffer itself rather than from the bias buffer.
+            newScaleBuffer = device.makeBuffer(length: param.scale.buffer.length / 2)!
+
+            float32ToFloat16(input: newBiase as! UnsafeMutablePointer<Float32>, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout<P>.size)
+            float32ToFloat16(input: newScale as! UnsafeMutablePointer<Float32>, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout<P>.size)
+        } else {
+            fatalError(" unsupport ")
+        }
+
+        param.newBiase = newBiaseBuffer
+        param.newScale = newScaleBuffer
+
+        // The scratch arrays were copied (or converted) into the Metal buffers above; release them.
+        newScale.deinitialize(count: param.scale.buffer.length)
+        newScale.deallocate()
+
+        newBiase.deinitialize(count: param.bias.buffer.length)
+        newBiase.deallocate()
     }
     
-   
-    
-    let offsetX = param.filter.width/2 - Int(param.paddings[0])
-    let offsetY = param.filter.height/2 - Int(param.paddings[1])
-    
-//    print(" param filter width: \(param.filter.width)")
-//    print(" param filter height: \(param.filter.height)")
-//
-//    print(" param paddings: \(param.paddings)")
-//
-//    print("ConvBNReluKernel offset x: \(offsetX)")
-//    print("ConvBNReluKernel offset y: \(offsetY)")
-    
-    let offsetZ = 0.0
-    
-    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
-    
-    var invs: [P] = []
-    let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
-    
-    for i in 0..<param.variance.buffer.length/MemoryLayout<P>.stride {
-      let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
-      invs.append(P(inv))
+    /// Encodes one pass: binds the input/output textures, `metalParam`, the filter buffer and `param.newScale` / `param.newBiase`, then dispatches `pipline`.
+    func compute(commandBuffer: MTLCommandBuffer, param: ConvBNReluParam<P>) throws {
+        guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        computeEncoder.setTexture(param.input.metalTexture, index: 0)
+        computeEncoder.setTexture(param.output.metalTexture, index: 1)
+        computeEncoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+        computeEncoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+        computeEncoder.setBuffer(param.newScale!, offset: 0, index: 2)
+        computeEncoder.setBuffer(param.newBiase!, offset: 0, index: 3)
+        computeEncoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        computeEncoder.endEncoding()
     }
     
-    let newScale: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.scale.buffer.length)
-    let newBiase: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.bias.buffer.length)
-    
-    let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
-    let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
-    let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
-    for i in 0..<param.scale.buffer.length/MemoryLayout<P>.stride {
-      newScale[i] = invs[i] * scaleContents[i]
-      newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i]
+    /// Test entry point: encodes one pass using only the textures, buffers and settings carried by `param`; traps if no encoder can be made.
+    public func test(commandBuffer: MTLCommandBuffer, param: ConvBNReluTestParam) {
+        guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else {
+            fatalError()
+        }
+        var convParam = param.metalParam
+        computeEncoder.setTexture(param.inputTexture, index: 0)
+        computeEncoder.setTexture(param.outputTexture, index: 1)
+        computeEncoder.setBytes(&convParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+        computeEncoder.setBuffer(param.filterBuffer, offset: 0, index: 1)
+        computeEncoder.setBuffer(param.newScaleBuffer, offset: 0, index: 2)
+        computeEncoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 3)
+        computeEncoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
+        computeEncoder.endEncoding()
     }
-    
-    var newBiaseBuffer: MTLBuffer
-    var newScaleBuffer: MTLBuffer
-    
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)!
-      newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)!
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      
-      newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
-      newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
-      
-      float32ToFloat16(input: newBiase as! UnsafeMutablePointer<Float32>, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout<P>.size)
-      
-      float32ToFloat16(input: newScale as! UnsafeMutablePointer<Float32>, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout<P>.size)
-    } else {
-      fatalError(" unsupport ")
-    }
-    
-    param.newBiase = newBiaseBuffer
-    param.newScale = newScaleBuffer
-    
-    newScale.deinitialize(count: param.scale.buffer.length)
-    newScale.deallocate()
-    
-    newBiase.deinitialize(count: param.bias.buffer.length)
-    newBiase.deallocate()
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ConvBNReluParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
-    }
-    
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
-    encoder.setBuffer(param.newScale!, offset: 0, index: 2)
-    encoder.setBuffer(param.newBiase!, offset: 0, index: 3)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-  
-  public func test(commandBuffer: MTLCommandBuffer, param: ConvBNReluTestParam) {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      fatalError()
-    }
-    
-    encoder.setTexture(param.inputTexture, index: 0)
-    encoder.setTexture(param.outputTexture, index: 1)
-    var inMetalParam = param.metalParam
-    encoder.setBytes(&inMetalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-    encoder.setBuffer(param.filterBuffer, offset: 0, index: 1)
-    encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 2)
-    encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 3)
-    encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
index 7571bc155b..7ff040219e 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
@@ -15,48 +15,46 @@
 import Foundation
 
 public struct MetalConvParam {
-  let offsetX: Int16
-  let offsetY: Int16
-  let offsetZ: Int16
-  let strideX: UInt16
-  let strideY: UInt16
-  let dilationX: UInt16
-  let dilationY: UInt16
+    let offsetX: Int16
+    let offsetY: Int16
+    let offsetZ: Int16
+    let strideX: UInt16
+    let strideY: UInt16
+    let dilationX: UInt16
+    let dilationY: UInt16
 }
 
 class ConvKernel<P: PrecisionType>: Kernel, Computable {
-  var metalParam: MetalConvParam!
-  required init(device: MTLDevice, param: ConvParam<P>, initContext: InitContext) {
-    param.filter.initBuffer(device: device, precision: ComputePrecision.Float32)
-    if param.filter.width == 1 && param.filter.height == 1 {
-      super.init(device: device, inFunctionName: "conv_1x1", initContext: initContext)
-    } else if param.filter.channel == 1 {
-      super.init(device: device, inFunctionName: "depthwise_conv_3x3", initContext: initContext)
-    } else if param.filter.width == 3 && param.filter.height == 3 {
-      super.init(device: device, inFunctionName: "conv_3x3", initContext: initContext)
-    } else {
-      fatalError(" unsupport ")
+    var metalParam: MetalConvParam!
+    required init(device: MTLDevice, param: ConvParam<P>, initContext: InitContext) {
+        param.filter.initBuffer(device: device, precision: ComputePrecision.Float32)
+        if param.filter.width == 1 && param.filter.height == 1 {
+            super.init(device: device, inFunctionName: "conv_1x1", initContext: initContext)
+        } else if param.filter.channel == 1 {
+            super.init(device: device, inFunctionName: "depthwise_conv_3x3", initContext: initContext)
+        } else if param.filter.width == 3 && param.filter.height == 3 {
+            super.init(device: device, inFunctionName: "conv_3x3", initContext: initContext)
+        } else {
+            fatalError(" unsupport ")
+        }
+        // NOTE(review): the channel == 1 branch above selects the 3x3 depthwise kernel without checking the filter size — confirm callers guarantee 3x3.
+        let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0])
+        let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1])
+        let offsetZ = 0.0
+        // Offsets above are half the filter dimension minus the padding; pack them with strides and dilations into the 16-bit fields of MetalConvParam.
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
     }
-
-    let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0])
-    let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1])
-    let offsetZ = 0.0
     
-    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        // Bindings: textures 0/1 = input/output, buffer 0 = MetalConvParam bytes, buffer 1 = filter buffer.
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+        encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
-    
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
 }
-
-
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvTransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvTransposeKernel.swift
index c8b1361649..f1753d0a09 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvTransposeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvTransposeKernel.swift
@@ -15,69 +15,69 @@
 import Foundation
 
 struct MetalConvTransposeParam {
-  let kernelW: UInt16;
-  let kernelH: UInt16;
-  
-  let strideX: UInt16;
-  let strideY: UInt16;
-  
-  let paddingX: UInt16;
-  let paddingY: UInt16;
-  
-  let dilationX: UInt16;
-  let dilationY: UInt16;
-}
-
-class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{
-  var metalParam: MetalConvTransposeParam!
-  required init(device: MTLDevice, param: ConvTransposeParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, convertToNHWC: false, withTranspose: true)
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.stride == [2, 2] && param.stride == [2, 2] {
-        super.init(device: device, inFunctionName: "conv_transpose2x2_stride2", initContext: initContext)
-      } else {
-        fatalError(" -- conv transpose unsupported yet -- ")
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.stride == [2, 2] && param.stride == [2, 2] {
-        super.init(device: device, inFunctionName: "conv_transpose2x2_stride2_half", initContext: initContext)
-      } else {
-        fatalError(" -- conv transpose unsupported yet -- ")
-      }
-    } else {
-      fatalError()
-    }
+    let kernelW: UInt16;
+    let kernelH: UInt16;
     
-//    let filter: [Float32] = param.filter.buffer.array()
-//    print(" conv transpose filter")
-//    print(filter)
-    let kernelWidth = UInt16(param.filter.width)
-    let kernelHeight = UInt16(param.filter.height)
+    let strideX: UInt16;
+    let strideY: UInt16;
     
-    let strideX = UInt16(param.stride[0])
-    let strideY = UInt16(param.stride[1])
-    let paddingX = UInt16(param.paddings[0])
-    let paddingY = UInt16(param.paddings[1])
-    let dilationX = UInt16(param.dilations[0])
-    let dilationY = UInt16(param.dilations[1])
+    let paddingX: UInt16;
+    let paddingY: UInt16;
     
-    metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY)
+    let dilationX: UInt16;
+    let dilationY: UInt16;
+}
 
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encoder is nil")
+class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{
+    var metalParam: MetalConvTransposeParam!
+    /// Prepares the output texture and filter buffer, selects the "conv_transpose2x2_stride2" kernel variant for the active precision, and records the launch parameters.
+    required init(device: MTLDevice, param: ConvTransposeParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+        param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, convertToNHWC: false, withTranspose: true)
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            if param.stride == [2, 2] {
+                super.init(device: device, inFunctionName: "conv_transpose2x2_stride2", initContext: initContext)
+            } else {
+                fatalError(" -- conv transpose unsupported yet -- ")
+            }
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            if param.stride == [2, 2] {
+                super.init(device: device, inFunctionName: "conv_transpose2x2_stride2_half", initContext: initContext)
+            } else {
+                fatalError(" -- conv transpose unsupported yet -- ")
+            }
+        } else {
+            fatalError()
+        }
+        // NOTE(review): only the stride is validated above; the previous condition repeated the stride test twice, so a second check (kernel size?) may have been intended — confirm.
+        //    let filter: [Float32] = param.filter.buffer.array()
+        //    print(" conv transpose filter")
+        //    print(filter)
+        let kernelWidth = UInt16(param.filter.width)
+        let kernelHeight = UInt16(param.filter.height)
+        
+        let strideX = UInt16(param.stride[0])
+        let strideY = UInt16(param.stride[1])
+        let paddingX = UInt16(param.paddings[0])
+        let paddingY = UInt16(param.paddings[1])
+        let dilationX = UInt16(param.dilations[0])
+        let dilationY = UInt16(param.dilations[1])
+        // Bundle the 16-bit values into the struct that compute() later passes to the kernel via setBytes.
+        metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY)
     }
     
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvTransposeParam>.size, index: 0)
-    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
+    func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encoder is nil")
+        }
+        // Bindings: textures 0/1 = input/output, buffer 0 = MetalConvTransposeParam bytes, buffer 1 = filter buffer.
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvTransposeParam>.size, index: 0)
+        encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
+    }
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift
index 21108de10e..2a87d4362f 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift
@@ -15,59 +15,59 @@
 import Foundation
 
 struct ElementwiseAddMetalParam {
-  var fast: Int32 = 0
-  var axis: Int32 = 0
-  var ylen: Int32 = 0
-  var xdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
-  var xtrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
-  var ydim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
-  var ytrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
+    var fast: Int32 = 0
+    var axis: Int32 = 0
+    var ylen: Int32 = 0
+    var xdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
+    var xtrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
+    var ydim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
+    var ytrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
 }
 
 class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable {
-  var metalParam: ElementwiseAddMetalParam
-  required init(device: MTLDevice, param: ElementwiseAddParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    
-    metalParam = ElementwiseAddMetalParam.init()
-    
-    let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) }
-    let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) }
-    let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) }
-    let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) }
-    
-    metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
-    metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
-    metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
-    metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
-    if param.axis == -1 {
-      metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout())
-    } else {
-      metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
+    var metalParam: ElementwiseAddMetalParam
+    required init(device: MTLDevice, param: ElementwiseAddParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+        // metalParam is a stored property of this class, so it is populated before super.init is called below.
+        metalParam = ElementwiseAddMetalParam.init()
+        // Read the first four dim / transpose entries of both inputs as Int32.
+        let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) }
+        let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) }
+        let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) }
+        let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) }
+        // Store them as the fixed-size tuples declared by ElementwiseAddMetalParam.
+        metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
+        metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
+        metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
+        metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
+        if param.axis == -1 {
+            metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout())
+        } else {
+            metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
+        }
+        metalParam.ylen = Int32(param.inputY.tensorDim.cout())
+        if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
+            //      print("===> elementwise_add fast!!!")
+            metalParam.fast = 1
+        }
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "elementwise_add", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "elementwise_add_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
     }
-    metalParam.ylen = Int32(param.inputY.tensorDim.cout())
-    if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
-      //      print("===> elementwise_add fast!!!")
-      metalParam.fast = 1
-    }
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "elementwise_add", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "elementwise_add_half", initContext: initContext)
-    } else {
-      fatalError()
-    }
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    /// Encodes one compute pass: binds inputX, inputY and output as textures 0-2 and metalParam as buffer 0, then dispatches with the output texture.
+    func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        encoder.setTexture(param.inputX.metalTexture, index: 0)
+        encoder.setTexture(param.inputY.metalTexture, index: 1)
+        encoder.setTexture(param.output.metalTexture, index: 2)
+        encoder.setBytes(&metalParam, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
-    encoder.setTexture(param.inputX.metalTexture, index: 0)
-    encoder.setTexture(param.inputY.metalTexture, index: 1)
-    encoder.setTexture(param.output.metalTexture, index: 2)
-    encoder.setBytes(&metalParam, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddPreluKernel.swift
index a423a119f3..cf83c2e750 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddPreluKernel.swift
@@ -16,64 +16,64 @@ import Foundation
 
 
 class ElementwiseAddPreluKernel<P: PrecisionType>: Kernel, Computable {
-  var metalParam: ElementwiseAddMetalParam
-  required init(device: MTLDevice, param: ElementwiseAddPreluParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-   
-    metalParam = ElementwiseAddMetalParam.init()
-    
-    let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) }
-    let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) }
-    let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) }
-    let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) }
-    
-    metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
-    metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
-    metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
-    metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
-    if param.axis == -1 {
-      metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout())
-    } else {
-      metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
-    }
-    metalParam.ylen = Int32(param.inputY.tensorDim.cout())
-    if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
-      //      print("===> elementwise_add fast!!!")
-      metalParam.fast = 1
+    var metalParam: ElementwiseAddMetalParam
+    required init(device: MTLDevice, param: ElementwiseAddPreluParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+        param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        // metalParam is a stored property of this class, so it is populated before any super.init call below.
+        metalParam = ElementwiseAddMetalParam.init()
+        // Read the first four dim / transpose entries of both inputs as Int32.
+        let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) }
+        let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) }
+        let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) }
+        let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) }
+        // Store them as the fixed-size tuples declared by ElementwiseAddMetalParam.
+        metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
+        metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
+        metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
+        metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
+        if param.axis == -1 {
+            metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout())
+        } else {
+            metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
+        }
+        metalParam.ylen = Int32(param.inputY.tensorDim.cout())
+        if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
+            //      print("===> elementwise_add fast!!!")
+            metalParam.fast = 1
+        }
+        // NOTE(review): every Float16 mode below selects "elementwise_add_channel_half", whereas Float32 uses a distinct kernel per mode — confirm this is intended.
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            if param.mode == "channel" {
+                super.init(device: device, inFunctionName: "elementwise_add_channel_float", initContext: initContext)
+            } else if param.mode == "element" {
+                super.init(device: device, inFunctionName: "elementwise_add_element_float", initContext: initContext)
+            } else {
+                super.init(device: device, inFunctionName: "elementwise_add_prelu_float", initContext: initContext)
+            }
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            if param.mode == "channel" {
+                super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext)
+            } else if param.mode == "element" {
+                super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext)
+            } else {
+                super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext)
+            }
+        } else {
+            fatalError()
+        }
     }
     
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.mode == "channel" {
-        super.init(device: device, inFunctionName: "elementwise_add_channel_float", initContext: initContext)
-      } else if param.mode == "element" {
-        super.init(device: device, inFunctionName: "elementwise_add_element_float", initContext: initContext)
-      } else {
-        super.init(device: device, inFunctionName: "elementwise_add_prelu_float", initContext: initContext)
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.mode == "channel" {
-        super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext)
-      } else if param.mode == "element" {
-        super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext)
-      } else {
-        super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext)
-      }
-    } else {
-      fatalError()
-    }
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddPreluParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddPreluParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        encoder.setTexture(param.inputX.metalTexture, index: 0)
+        encoder.setTexture(param.inputY.metalTexture, index: 1)
+        encoder.setTexture(param.output.metalTexture, index: 2)
+        encoder.setBytes(&metalParam, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0)
+        encoder.setBuffer(param.alpha.buffer, offset: 0, index: 1)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
-    encoder.setTexture(param.inputX.metalTexture, index: 0)
-    encoder.setTexture(param.inputY.metalTexture, index: 1)
-    encoder.setTexture(param.output.metalTexture, index: 2)
-    encoder.setBytes(&metalParam, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0)
-    encoder.setBuffer(param.alpha.buffer, offset: 0, index: 1)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FetchKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FetchKernel.swift
index 7d6e68e699..616fcc1f2d 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FetchKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FetchKernel.swift
@@ -15,47 +15,47 @@
 import Foundation
 
 class FetchKernel<P: PrecisionType>: Kernel, Computable {
-  
-  required init(device: MTLDevice, param: FetchParam<P>, initContext: InitContext) {
-    param.output.initBuffer(device: device)
-    if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.input.transpose == [0, 2, 3, 1] {
-        super.init(device: device, inFunctionName: "fetch_half", initContext: initContext)
-      } else if param.input.transpose == [0, 1, 2, 3] {
-        switch param.input.tensorDim.cout() {
-        case 1, 2:
-          super.init(device: device, inFunctionName: "fetch_1or2_half", initContext: initContext)
-        default:
-          fatalError(" not support ")
+    /// Initializes the output buffer and selects the fetch kernel by compute precision, input transpose and `tensorDim.cout()`; unsupported combinations hit fatalError.
+    required init(device: MTLDevice, param: FetchParam<P>, initContext: InitContext) {
+        param.output.initBuffer(device: device)
+        if GlobalConfig.shared.computePrecision == .Float16 {
+            if param.input.transpose == [0, 2, 3, 1] {
+                super.init(device: device, inFunctionName: "fetch_half", initContext: initContext)
+            } else if param.input.transpose == [0, 1, 2, 3] {
+                switch param.input.tensorDim.cout() {
+                case 1, 2:
+                    super.init(device: device, inFunctionName: "fetch_1or2_half", initContext: initContext)
+                default:
+                    fatalError(" not support ")
+                }
+            } else {
+                fatalError(" not support ")
+            }
+        } else if GlobalConfig.shared.computePrecision == .Float32 {
+            if param.input.transpose == [0, 2, 3, 1] {
+                super.init(device: device, inFunctionName: "fetch_float", initContext: initContext)
+            } else if param.input.transpose == [0, 1, 2, 3] {
+                switch param.input.tensorDim.cout() {
+                case 1, 2:
+                    super.init(device: device, inFunctionName: "fetch_1or2_float", initContext: initContext)
+                default:
+                    fatalError(" not support ")
+                }
+            } else {
+                fatalError(" not support ")
+            }
+        } else {
+            fatalError(" not support ")
         }
-      } else {
-        fatalError(" not support ")
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.input.transpose == [0, 2, 3, 1] {
-        super.init(device: device, inFunctionName: "fetch_float", initContext: initContext)
-      } else if param.input.transpose == [0, 1, 2, 3] {
-        switch param.input.tensorDim.cout() {
-        case 1, 2:
-          super.init(device: device, inFunctionName: "fetch_1or2_float", initContext: initContext)
-        default:
-          fatalError(" not support ")
-        }
-      } else {
-        fatalError(" not support ")
-      }
-    } else {
-      fatalError(" not support ")
     }
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: FetchParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    /// Encodes the fetch pass (input texture 0 -> result buffer 0); throws instead of crashing when the result buffer or the encoder is unavailable.
+    func compute(commandBuffer: MTLCommandBuffer, param: FetchParam<P>) throws {
+        guard let resultBuffer = param.output.resultBuffer, let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encoder or result buffer is nil")
+        }
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setBuffer(resultBuffer, offset: 0, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture)
+        encoder.endEncoding()
     }
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setBuffer(param.output.resultBuffer!, offset: 0, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FlattenKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FlattenKernel.swift
index 06a6537e1f..5956806001 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FlattenKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FlattenKernel.swift
@@ -15,57 +15,57 @@
 import Foundation
 
 struct FlattenMetalParam {
-  var idim: (Int32, Int32, Int32, Int32)
-  var itrans: (Int32, Int32, Int32, Int32)
-  var odim: (Int32, Int32, Int32, Int32)
-  var otrans: (Int32, Int32, Int32, Int32)
+    var idim: (Int32, Int32, Int32, Int32)
+    var itrans: (Int32, Int32, Int32, Int32)
+    var odim: (Int32, Int32, Int32, Int32)
+    var otrans: (Int32, Int32, Int32, Int32)
 }
 
 
 class FlattenKernel<P: PrecisionType>: Kernel, Computable{
-  
-  var metalParam: FlattenMetalParam
-  
-  required init(device: MTLDevice, param: FlattenParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
-    var id: [Int32] = [1, 1, 1, 1]
-    for i in 0..<param.input.tensorDim.cout() {
-      id[4-param.input.tensorDim.cout()+i] = Int32(param.input.tensorDim[i])
-    }
-    let it: [Int32] = param.input.transpose.map { Int32($0) }
-    var od: [Int32] = [1, 1, 1, 1]
-    for i in 0..<param.output.tensorDim.cout() {
-      od[4-param.output.tensorDim.cout()+i] = Int32(param.output.tensorDim[i])
-    }
-    let ot: [Int32] = param.output.transpose.map { Int32($0) }
-    metalParam = FlattenMetalParam.init(
-      idim: (id[0], id[1], id[2], id[3]),
-      itrans: (it[0], it[1], it[2], it[3]),
-      odim: (od[0], od[1], od[2], od[3]),
-      otrans: (ot[0], ot[1], ot[2], ot[3])
-    )
-    let irank = param.input.tensorDim.cout()
-    let orank = param.output.tensorDim.cout()
-    assert(orank == 2)
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "reshape_\(irank)_2_float", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "reshape_\(irank)_2_half", initContext: initContext)
-    } else {
-      fatalError()
-    }
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: FlattenParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encoder is nil")
+    
+    var metalParam: FlattenMetalParam
+    /// Builds 4-entry dim tuples (left-padded with 1s) and transpose tuples for input and output, then selects the "reshape_<input cout>_2_<precision>" kernel.
+    required init(device: MTLDevice, param: FlattenParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
+        var id: [Int32] = [1, 1, 1, 1]
+        for i in 0..<param.input.tensorDim.cout() {
+            id[4-param.input.tensorDim.cout()+i] = Int32(param.input.tensorDim[i])
+        }
+        let it: [Int32] = param.input.transpose.map { Int32($0) }
+        var od: [Int32] = [1, 1, 1, 1]
+        for i in 0..<param.output.tensorDim.cout() {
+            od[4-param.output.tensorDim.cout()+i] = Int32(param.output.tensorDim[i])
+        }
+        let ot: [Int32] = param.output.transpose.map { Int32($0) }
+        metalParam = FlattenMetalParam.init(
+            idim: (id[0], id[1], id[2], id[3]),
+            itrans: (it[0], it[1], it[2], it[3]),
+            odim: (od[0], od[1], od[2], od[3]),
+            otrans: (ot[0], ot[1], ot[2], ot[3])
+        )
+        let irank = param.input.tensorDim.cout()
+        let orank = param.output.tensorDim.cout()
+        assert(orank == 2)
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "reshape_\(irank)_2_float", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "reshape_\(irank)_2_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
     }
     
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-
-    encoder.setBytes(&metalParam, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
+    /// Encodes the flatten pass: binds input/output as textures 0/1 and the FlattenMetalParam bytes as buffer 0, then dispatches with the output texture.
+    func compute(commandBuffer: MTLCommandBuffer, param: FlattenParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encoder is nil")
+        }
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        // The byte count must describe the value actually passed, so size it from FlattenMetalParam rather than ReshapeMetalParam.
+        encoder.setBytes(&metalParam, length: MemoryLayout<FlattenMetalParam>.size, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
+    }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/MulticlassNMSKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/MulticlassNMSKernel.swift
index d3fc5a3ac9..4f59bf9971 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/MulticlassNMSKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/MulticlassNMSKernel.swift
@@ -15,41 +15,41 @@
 import Foundation
 
 class MulticlassNMSKernel<P: PrecisionType>: Kernel, Computable{
-  let pipline1: MTLComputePipelineState
-
-  required init(device: MTLDevice, param: MulticlassNMSParam<P>, initContext: InitContext) {
+    let pipline1: MTLComputePipelineState
     
-    param.middleOutput.initBuffer(device: device)
-    param.bboxOutput.initBuffer(device: device)
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      pipline1 = device.pipeLine(funcName: "nms_fetch_bbox", metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath)
-      super.init(device: device, inFunctionName: "nms_fetch_result", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      pipline1 = device.pipeLine(funcName: "nms_fetch_bbox_half", metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath)
-      super.init(device: device, inFunctionName: "nms_fetch_result_half", initContext: initContext)
-    } else {
-      fatalError( " unsupport precision " )
+    required init(device: MTLDevice, param: MulticlassNMSParam<P>, initContext: InitContext) {
+        
+        param.middleOutput.initBuffer(device: device)
+        param.bboxOutput.initBuffer(device: device)
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            pipline1 = device.pipeLine(funcName: "nms_fetch_bbox", metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath)
+            super.init(device: device, inFunctionName: "nms_fetch_result", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            pipline1 = device.pipeLine(funcName: "nms_fetch_bbox_half", metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath)
+            super.init(device: device, inFunctionName: "nms_fetch_result_half", initContext: initContext)
+        } else {
+            fatalError( " unsupport precision " )
+        }
+        
     }
     
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: MulticlassNMSParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    func compute(commandBuffer: MTLCommandBuffer, param: MulticlassNMSParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        
+        encoder.setTexture(param.scores.metalTexture, index: 0)
+        encoder.setBuffer(param.middleOutput.resultBuffer!, offset: 0, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.scores.metalTexture)
+        encoder.endEncoding()
+        
+        guard let encoderBox = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        
+        encoderBox.setTexture(param.bboxes.metalTexture, index: 0)
+        encoderBox.setBuffer(param.bboxOutput.resultBuffer!, offset: 0, index: 0)
+        encoderBox.dispatch(computePipline: pipline1, outTexture: param.bboxes.metalTexture)
+        encoderBox.endEncoding()
     }
-    
-    encoder.setTexture(param.scores.metalTexture, index: 0)
-    encoder.setBuffer(param.middleOutput.resultBuffer!, offset: 0, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.scores.metalTexture)
-    encoder.endEncoding()
-    
-    guard let encoderBox = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
-    }
-    
-    encoderBox.setTexture(param.bboxes.metalTexture, index: 0)
-    encoderBox.setBuffer(param.bboxOutput.resultBuffer!, offset: 0, index: 0)
-    encoderBox.dispatch(computePipline: pipline1, outTexture: param.bboxes.metalTexture)
-    encoderBox.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PoolKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PoolKernel.swift
index b6833a4f93..37878f26d0 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PoolKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PoolKernel.swift
@@ -15,57 +15,57 @@
 import Foundation
 
 struct PoolMetalParam {
-  let ksizeX: Int32
-  let ksizeY: Int32
-  let strideX: Int32
-  let strideY: Int32
-  let paddingX: Int32
-  let paddingY: Int32
-  let poolType: Int32
+    let ksizeX: Int32
+    let ksizeY: Int32
+    let strideX: Int32
+    let strideY: Int32
+    let paddingX: Int32
+    let paddingY: Int32
+    let poolType: Int32
 }
 
 class PoolKernel<P: PrecisionType>: Kernel, Computable{
-  var metalParam: PoolMetalParam
-  required init(device: MTLDevice, param: PoolParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    
-    var poolType: Int32
-    switch param.poolType {
-    case "max":
-      poolType = 0
-    case "avg":
-      poolType = 1
-    default:
-      fatalError()
+    var metalParam: PoolMetalParam
+    required init(device: MTLDevice, param: PoolParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+        
+        var poolType: Int32
+        switch param.poolType {
+        case "max":
+            poolType = 0
+        case "avg":
+            poolType = 1
+        default:
+            fatalError()
+        }
+        metalParam = PoolMetalParam.init(
+            ksizeX: param.ksize[0],
+            ksizeY: param.ksize[1],
+            strideX: param.stride[0],
+            strideY: param.stride[1],
+            paddingX: param.padding[0],
+            paddingY: param.padding[1],
+            poolType: poolType
+        )
+        
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "pool_float", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "pool_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
     }
-    metalParam = PoolMetalParam.init(
-      ksizeX: param.ksize[0],
-      ksizeY: param.ksize[1],
-      strideX: param.stride[0],
-      strideY: param.stride[1],
-      paddingX: param.padding[0],
-      paddingY: param.padding[1],
-      poolType: poolType
-    )
     
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "pool_float", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "pool_half", initContext: initContext)
-    } else {
-      fatalError()
-    }
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: PoolParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encoder is nil")
+    func compute(commandBuffer: MTLCommandBuffer, param: PoolParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encoder is nil")
+        }
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        
+        encoder.setBytes(&metalParam, length: MemoryLayout<PoolMetalParam>.size, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-
-    encoder.setBytes(&metalParam, length: MemoryLayout<PoolMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PreluKernel.swift
index 61a21331a6..053cb827e3 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PreluKernel.swift
@@ -15,39 +15,39 @@
 import Foundation
 
 class PreluKernel<P: PrecisionType>: Kernel, Computable{
-  required init(device: MTLDevice, param: PreluParam<P>, initContext: InitContext) {
-    param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.mode == "channel" {
-        super.init(device: device, inFunctionName: "prelu_channel", initContext: initContext)
-      } else if param.mode == "element" {
-        super.init(device: device, inFunctionName: "prelu_element", initContext: initContext)
-      } else {
-        super.init(device: device, inFunctionName: "prelu_other", initContext: initContext)
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.mode == "channel" {
-        super.init(device: device, inFunctionName: "prelu_channel_half", initContext: initContext)
-      } else if param.mode == "element" {
-        super.init(device: device, inFunctionName: "prelu_element_half", initContext: initContext)
-      } else {
-        super.init(device: device, inFunctionName: "prelu_other_half", initContext: initContext)
-      }
-    } else {
-      fatalError()
-    }
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: PreluParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encoder is nil")
+    required init(device: MTLDevice, param: PreluParam<P>, initContext: InitContext) {
+        param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            if param.mode == "channel" {
+                super.init(device: device, inFunctionName: "prelu_channel", initContext: initContext)
+            } else if param.mode == "element" {
+                super.init(device: device, inFunctionName: "prelu_element", initContext: initContext)
+            } else {
+                super.init(device: device, inFunctionName: "prelu_other", initContext: initContext)
+            }
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            if param.mode == "channel" {
+                super.init(device: device, inFunctionName: "prelu_channel_half", initContext: initContext)
+            } else if param.mode == "element" {
+                super.init(device: device, inFunctionName: "prelu_element_half", initContext: initContext)
+            } else {
+                super.init(device: device, inFunctionName: "prelu_other_half", initContext: initContext)
+            }
+        } else {
+            fatalError()
+        }
     }
     
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBuffer(param.alpha.buffer, offset: 0, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
+    func compute(commandBuffer: MTLCommandBuffer, param: PreluParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encoder is nil")
+        }
+        
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setBuffer(param.alpha.buffer, offset: 0, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
+    }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PriorBoxKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PriorBoxKernel.swift
index 15126bbc83..cb8ef81de3 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PriorBoxKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PriorBoxKernel.swift
@@ -15,136 +15,136 @@
 import Foundation
 
 struct PriorBoxMetalParam {
-  let offset: Float32
-  let stepWidth: Float32
-  let stepHeight: Float32
-  let minSize: Float32
-  let maxSize: Float32
-  let imageWidth: Float32
-  let imageHeight: Float32
-  let clip: Bool
-  let numPriors: uint
-  let aspecRatiosSize: uint
-  let minSizeSize: uint
-  let maxSizeSize: uint
+    let offset: Float32
+    let stepWidth: Float32
+    let stepHeight: Float32
+    let minSize: Float32
+    let maxSize: Float32
+    let imageWidth: Float32
+    let imageHeight: Float32
+    let clip: Bool
+    let numPriors: uint
+    let aspecRatiosSize: uint
+    let minSizeSize: uint
+    let maxSizeSize: uint
 }
 
 class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{
-  var metalParam: PriorBoxMetalParam!
-  
-  required init(device: MTLDevice, param: PriorBoxParam<P>, initContext: InitContext) {
-    
-    let originDim = param.output.tensorDim;
-    
-    param.output.tensorDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]])
-    param.output.padToFourDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]])
-    
-    param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: GlobalConfig.shared.computePrecision)
-    param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: GlobalConfig.shared.computePrecision)
-    
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.min_max_aspect_ratios_order {
-        super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder", initContext: initContext)
-      } else {
-        super.init(device: device, inFunctionName: "prior_box", initContext: initContext)
-      }
-      
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.min_max_aspect_ratios_order {
-        super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder_half", initContext: initContext)
-      } else {
-        super.init(device: device, inFunctionName: "prior_box_half", initContext: initContext)
-      }
-    } else {
-      fatalError()
-    }
-    
-    
-    guard param.minSizes.count == 1 else {
-      fatalError(" need implement ")
-    }
-    
-//    let n = 1
-//    let h = param.output.dim[1]
-//    let w = param.output.dim[2]
-//    let c = param.output.dim[3] * param.output.dim[0]
-//
-//    param.output.dim = Dim.init(inDim: [n, h, w, c])
-//    param.output.transpose = [0, 1, 2, 3]
-    
-    let imageWidth = Float32(param.inputImage.padToFourDim[3])
-    let imageHeight = Float32(param.inputImage.padToFourDim[2])
-    
-    let featureWidth = param.input.padToFourDim[3]
-    let featureHeight = param.input.padToFourDim[2]
-    
-    if param.stepW == 0 || param.stepH == 0 {
-      param.stepW = Float32(imageWidth) / Float32(featureWidth)
-      param.stepH = Float32(imageHeight) / Float32(featureHeight)
-    }
-    
-    var outputAspectRatior: [Float32] = []
-    outputAspectRatior.append(1.0)
-    
-    let epsilon = 1e-6
-    for ar in param.aspectRatios {
-      var alreadyExist = false
-      for outputAr in outputAspectRatior {
-        if fabs(Double(ar) - Double(outputAr)) < Double(epsilon) {
-          alreadyExist = true
-          break
+    var metalParam: PriorBoxMetalParam!
+    
+    required init(device: MTLDevice, param: PriorBoxParam<P>, initContext: InitContext) {
+        
+        let originDim = param.output.tensorDim;
+        
+        param.output.tensorDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]])
+        param.output.padToFourDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]])
+        
+        param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: GlobalConfig.shared.computePrecision)
+        param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: GlobalConfig.shared.computePrecision)
+        
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            if param.min_max_aspect_ratios_order {
+                super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder", initContext: initContext)
+            } else {
+                super.init(device: device, inFunctionName: "prior_box", initContext: initContext)
+            }
+            
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            if param.min_max_aspect_ratios_order {
+                super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder_half", initContext: initContext)
+            } else {
+                super.init(device: device, inFunctionName: "prior_box_half", initContext: initContext)
+            }
+        } else {
+            fatalError()
         }
-      }
-      
-      if !alreadyExist {
-        outputAspectRatior.append(ar)
-      }
-      if param.flip {
-        outputAspectRatior.append(1.0 / ar)
-      }
-    }
-    
-    if GlobalConfig.shared.computePrecision == .Float16 {
-      let buffer = device.makeBuffer(length: outputAspectRatior.count * MemoryLayout<Float16>.size)
-      float32ToFloat16(input: &outputAspectRatior, output:(buffer?.contents())!, count: outputAspectRatior.count)
-      param.newAspectRatios = buffer
-
-    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      let buffer = device.makeBuffer(bytes: outputAspectRatior, length: outputAspectRatior.count * MemoryLayout<Float32>.size, options: [])
-      param.newAspectRatios = buffer
-    } else {
-      fatalError()
+        
+        
+        guard param.minSizes.count == 1 else {
+            fatalError(" need implement ")
+        }
+        
+        //    let n = 1
+        //    let h = param.output.dim[1]
+        //    let w = param.output.dim[2]
+        //    let c = param.output.dim[3] * param.output.dim[0]
+        //
+        //    param.output.dim = Dim.init(inDim: [n, h, w, c])
+        //    param.output.transpose = [0, 1, 2, 3]
+        
+        let imageWidth = Float32(param.inputImage.padToFourDim[3])
+        let imageHeight = Float32(param.inputImage.padToFourDim[2])
+        
+        let featureWidth = param.input.padToFourDim[3]
+        let featureHeight = param.input.padToFourDim[2]
+        
+        if param.stepW == 0 || param.stepH == 0 {
+            param.stepW = Float32(imageWidth) / Float32(featureWidth)
+            param.stepH = Float32(imageHeight) / Float32(featureHeight)
+        }
+        
+        var outputAspectRatior: [Float32] = []
+        outputAspectRatior.append(1.0)
+        
+        let epsilon = 1e-6
+        for ar in param.aspectRatios {
+            var alreadyExist = false
+            for outputAr in outputAspectRatior {
+                if fabs(Double(ar) - Double(outputAr)) < Double(epsilon) {
+                    alreadyExist = true
+                    break
+                }
+            }
+            
+            if !alreadyExist {
+                outputAspectRatior.append(ar)
+            }
+            if param.flip {
+                outputAspectRatior.append(1.0 / ar)
+            }
+        }
+        
+        if GlobalConfig.shared.computePrecision == .Float16 {
+            let buffer = device.makeBuffer(length: outputAspectRatior.count * MemoryLayout<Float16>.size)
+            float32ToFloat16(input: &outputAspectRatior, output:(buffer?.contents())!, count: outputAspectRatior.count)
+            param.newAspectRatios = buffer
+            
+        } else if GlobalConfig.shared.computePrecision == .Float32 {
+            let buffer = device.makeBuffer(bytes: outputAspectRatior, length: outputAspectRatior.count * MemoryLayout<Float32>.size, options: [])
+            param.newAspectRatios = buffer
+        } else {
+            fatalError()
+        }
+        
+        let aspectRatiosSize = uint(outputAspectRatior.count)
+        
+        let maxSizeSize: uint = uint(param.maxSizes.count)
+        let minSizeSize: uint = uint(param.minSizes.count)
+        
+        let numPriors = aspectRatiosSize * minSizeSize + maxSizeSize
+        
+        let minSize = param.minSizes.last ?? 0.0
+        let maxSize = param.maxSizes.last ?? 0.0
+        
+        metalParam = PriorBoxMetalParam.init(offset: param.offset, stepWidth: param.stepW, stepHeight: param.stepH, minSize: minSize, maxSize: maxSize, imageWidth: imageWidth, imageHeight: imageHeight, clip: param.clip, numPriors: numPriors, aspecRatiosSize: aspectRatiosSize, minSizeSize: minSizeSize, maxSizeSize: maxSizeSize)
+        
     }
     
-    let aspectRatiosSize = uint(outputAspectRatior.count)
-    
-    let maxSizeSize: uint = uint(param.maxSizes.count)
-    let minSizeSize: uint = uint(param.minSizes.count)
-    
-    let numPriors = aspectRatiosSize * minSizeSize + maxSizeSize
-    
-    let minSize = param.minSizes.last ?? 0.0
-    let maxSize = param.maxSizes.last ?? 0.0
-    
-    metalParam = PriorBoxMetalParam.init(offset: param.offset, stepWidth: param.stepW, stepHeight: param.stepH, minSize: minSize, maxSize: maxSize, imageWidth: imageWidth, imageHeight: imageHeight, clip: param.clip, numPriors: numPriors, aspecRatiosSize: aspectRatiosSize, minSizeSize: minSizeSize, maxSizeSize: maxSizeSize)
-    
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: PriorBoxParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    func compute(commandBuffer: MTLCommandBuffer, param: PriorBoxParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setTexture(param.outputVariances.metalTexture, index: 2)
+        
+        encoder.setBuffer(param.newAspectRatios!, offset: 0, index: 0)
+        
+        encoder.setBytes(&metalParam, length: MemoryLayout<PriorBoxMetalParam>.size, index: 1)
+        
+        encoder.setBytes(param.variances, length: MemoryLayout<Float32>.size * param.variances.count, index: 2)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
-    
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setTexture(param.outputVariances.metalTexture, index: 2)
-    
-    encoder.setBuffer(param.newAspectRatios!, offset: 0, index: 0)
-    
-    encoder.setBytes(&metalParam, length: MemoryLayout<PriorBoxMetalParam>.size, index: 1)
-    
-    encoder.setBytes(param.variances, length: MemoryLayout<Float32>.size * param.variances.count, index: 2)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift
index 0bde0623ef..06ff7d3990 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift
@@ -15,23 +15,23 @@
 import Foundation
 
 class ReluKernel<P: PrecisionType>: Kernel, Computable{
-  func compute(commandBuffer: MTLCommandBuffer, param: ReluParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    func compute(commandBuffer: MTLCommandBuffer, param: ReluParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-  
-  required init(device: MTLDevice, param: ReluParam<P>, initContext: InitContext) {
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "relu", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "relu_half", initContext: initContext)
-    } else {
-      fatalError()
+    
+    required init(device: MTLDevice, param: ReluParam<P>, initContext: InitContext) {
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "relu", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "relu_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
     }
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReshapeKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReshapeKernel.swift
index f14db86a3a..954eff9a56 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReshapeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReshapeKernel.swift
@@ -15,83 +15,83 @@
 import Foundation
 
 struct ReshapeMetalParam {
-  var idim: (Int32, Int32, Int32, Int32)
-  var itrans: (Int32, Int32, Int32, Int32)
-  var odim: (Int32, Int32, Int32, Int32)
-  var otrans: (Int32, Int32, Int32, Int32)
+    var idim: (Int32, Int32, Int32, Int32)
+    var itrans: (Int32, Int32, Int32, Int32)
+    var odim: (Int32, Int32, Int32, Int32)
+    var otrans: (Int32, Int32, Int32, Int32)
 }
 
 struct ReshapeTestParam: TestParam {
-  let inputTexture: MTLTexture
-  let outputTexture: MTLTexture
-  let param: ReshapeMetalParam
+    let inputTexture: MTLTexture
+    let outputTexture: MTLTexture
+    let param: ReshapeMetalParam
 }
 
 class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
-  
-  var metalParam: ReshapeMetalParam
-  
-  required init(device: MTLDevice, param: ReshapeParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
-    var id: [Int32] = [1, 1, 1, 1]
-    for i in 0..<param.input.tensorDim.cout() {
-      id[4-param.input.tensorDim.cout()+i] = Int32(param.input.tensorDim[i])
+    
+    var metalParam: ReshapeMetalParam
+    
+    required init(device: MTLDevice, param: ReshapeParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
+        var id: [Int32] = [1, 1, 1, 1]
+        for i in 0..<param.input.tensorDim.cout() {
+            id[4-param.input.tensorDim.cout()+i] = Int32(param.input.tensorDim[i])
+        }
+        let it: [Int32] = param.input.transpose.map { Int32($0) }
+        var od: [Int32] = [1, 1, 1, 1]
+        for i in 0..<param.output.tensorDim.cout() {
+            od[4-param.output.tensorDim.cout()+i] = Int32(param.output.tensorDim[i])
+        }
+        let ot: [Int32] = param.output.transpose.map { Int32($0) }
+        metalParam = ReshapeMetalParam.init(
+            idim: (id[0], id[1], id[2], id[3]),
+            itrans: (it[0], it[1], it[2], it[3]),
+            odim: (od[0], od[1], od[2], od[3]),
+            otrans: (ot[0], ot[1], ot[2], ot[3])
+        )
+        let irank = param.input.tensorDim.cout()
+        let orank = param.output.tensorDim.cout()
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_float", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
     }
-    let it: [Int32] = param.input.transpose.map { Int32($0) }
-    var od: [Int32] = [1, 1, 1, 1]
-    for i in 0..<param.output.tensorDim.cout() {
-      od[4-param.output.tensorDim.cout()+i] = Int32(param.output.tensorDim[i])
+    
+    required init(device: MTLDevice, testParam: ReshapeTestParam, initContext: InitContext) {
+        metalParam = ReshapeMetalParam.init(
+            idim: (0, 0, 0, 0),
+            itrans: (0, 0, 0, 0),
+            odim: (0, 0, 0, 0),
+            otrans: (0, 0, 0, 0)
+        )
+        super.init(device: device, inFunctionName: "reshape", initContext: initContext)
     }
-    let ot: [Int32] = param.output.transpose.map { Int32($0) }
-    metalParam = ReshapeMetalParam.init(
-      idim: (id[0], id[1], id[2], id[3]),
-      itrans: (it[0], it[1], it[2], it[3]),
-      odim: (od[0], od[1], od[2], od[3]),
-      otrans: (ot[0], ot[1], ot[2], ot[3])
-    )
-    let irank = param.input.tensorDim.cout()
-    let orank = param.output.tensorDim.cout()
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_float", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_half", initContext: initContext)
-    } else {
-      fatalError()
+    
+    func compute(commandBuffer: MTLCommandBuffer, param: ReshapeParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encoder is nil")
+        }
+        
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        
+        encoder.setBytes(&metalParam, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
-  }
-  
-  required init(device: MTLDevice, testParam: ReshapeTestParam, initContext: InitContext) {
-    metalParam = ReshapeMetalParam.init(
-    idim: (0, 0, 0, 0),
-    itrans: (0, 0, 0, 0),
-    odim: (0, 0, 0, 0),
-    otrans: (0, 0, 0, 0)
-    )
-    super.init(device: device, inFunctionName: "reshape", initContext: initContext)
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ReshapeParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encoder is nil")
-    }
-
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-
-    encoder.setBytes(&metalParam, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-  
-//  func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) {
-//    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-//      fatalError()
-//    }
-//    encoder.setTexture(testParam.inputTexture, index: 0)
-//    encoder.setTexture(testParam.outputTexture, index: 1)
-//    var pm: ReshapeMetalParam = testParam.param
-//    encoder.setBytes(&pm, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
-//    encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture)
-//    encoder.endEncoding()
-//  }
+    
+    //  func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) {
+    //    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+    //      fatalError()
+    //    }
+    //    encoder.setTexture(testParam.inputTexture, index: 0)
+    //    encoder.setTexture(testParam.outputTexture, index: 1)
+    //    var pm: ReshapeMetalParam = testParam.param
+    //    encoder.setBytes(&pm, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
+    //    encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture)
+    //    encoder.endEncoding()
+    //  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ResizeBilinearKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ResizeBilinearKernel.swift
index a007196b67..7e9105ae57 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ResizeBilinearKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ResizeBilinearKernel.swift
@@ -15,37 +15,37 @@
 import Foundation
 
 struct ResizeBilinearMetalParam {
-  var ratio_h: Float32
-  var ratio_w: Float32
+    var ratio_h: Float32
+    var ratio_w: Float32
 }
 
 class ResizeBilinearKernel<P: PrecisionType>: Kernel, Computable{
-  required init(device: MTLDevice, param: ResizeBilinearParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "resize_bilinear", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "resize_bilinear_half", initContext: initContext)
-    } else {
-      fatalError()
+    required init(device: MTLDevice, param: ResizeBilinearParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "resize_bilinear", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "resize_bilinear_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
     }
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ResizeBilinearParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    
+    func compute(commandBuffer: MTLCommandBuffer, param: ResizeBilinearParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        let ratio_h: Float32 = Float32(param.input.tensorDim.dims[2]) / Float32(param.output.tensorDim.dims[2])
+        let ratio_w: Float32 = Float32(param.input.tensorDim.dims[3]) / Float32(param.output.tensorDim.dims[3])
+        var p = ResizeBilinearMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w)
+        // The byte length must describe `p` itself (a ResizeBilinearMetalParam), not another kernel's param struct.
+        encoder.setBytes(&p, length: MemoryLayout<ResizeBilinearMetalParam>.size, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
     
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    let ratio_h: Float32 = Float32(param.input.tensorDim.dims[2]) / Float32(param.output.tensorDim.dims[2])
-    let ratio_w: Float32 = Float32(param.input.tensorDim.dims[3]) / Float32(param.output.tensorDim.dims[3])
-    var p = ResizeBilinearMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w)
-    encoder.setBytes(&p, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-  
-
-  
+    
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift
index 2afee5607d..4a6a9a3ee4 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift
@@ -15,14 +15,14 @@
 import Foundation
 
 class ScaleKernel: CusomKernel {
-  init(device: MTLDevice, shape: Shape, metalLoadMode: MetalLoadMode, metalLibPath: String?) {
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "scale", outputDim: shape, metalLoadModel: metalLoadMode, metalLibPath: metalLibPath)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "scale_half", outputDim: shape, metalLoadModel: metalLoadMode, metalLibPath: metalLibPath)
-    } else {
-      fatalError(" unsupport ")
+    init(device: MTLDevice, shape: Shape, metalLoadMode: MetalLoadMode, metalLibPath: String?) {
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "scale", outputDim: shape, metalLoadModel: metalLoadMode, metalLibPath: metalLibPath)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "scale_half", outputDim: shape, metalLoadModel: metalLoadMode, metalLibPath: metalLibPath)
+        } else {
+            fatalError(" unsupport ")
+        }
     }
-  }
 }
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ShapeKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ShapeKernel.swift
index dfec8f9adf..1d2b80cae4 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ShapeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ShapeKernel.swift
@@ -19,24 +19,24 @@ struct ShapeMetalParam {
 }
 
 class ShapeKernel<P: PrecisionType>: Kernel, Computable{
-  func compute(commandBuffer: MTLCommandBuffer, param: ShapeParam<P>) throws {
-//    print("shape compute")
-//    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-//      throw PaddleMobileError.predictError(message: " encode is nil")
-//    }
-//    encoder.setTexture(param.output.metalTexture, index: 0)
-//    encoder.endEncoding()
-  }
-  
-  required init(device: MTLDevice, param: ShapeParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "shape", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "shape_half", initContext: initContext)
-    } else {
-      fatalError()
+    func compute(commandBuffer: MTLCommandBuffer, param: ShapeParam<P>) throws {
+        //    print("shape compute")
+        //    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+        //      throw PaddleMobileError.predictError(message: " encode is nil")
+        //    }
+        //    encoder.setTexture(param.output.metalTexture, index: 0)
+        //    encoder.endEncoding()
     }
-  }
-  
+    
+    required init(device: MTLDevice, param: ShapeParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "shape", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "shape_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
+    }
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SoftmaxKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SoftmaxKernel.swift
index 1eac43484d..b4f3281425 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SoftmaxKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SoftmaxKernel.swift
@@ -15,37 +15,37 @@
 import Foundation
 
 struct SoftmaxMetalParam {
-  let N: Int32
-  let K: Int32
+    let N: Int32
+    let K: Int32
 }
 
 class SoftmaxKernel<P: PrecisionType>: Kernel, Computable{
-  
-  var metalParam: SoftmaxMetalParam
-  required init(device: MTLDevice, param: SoftmaxParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
-    metalParam = SoftmaxMetalParam.init(
-      N: Int32(param.input.tensorDim[0]),
-      K: Int32(param.input.tensorDim[1])
-    )
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "softmax_float", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "softmax_half", initContext: initContext)
-    } else {
-      fatalError()
+    
+    var metalParam: SoftmaxMetalParam
+    required init(device: MTLDevice, param: SoftmaxParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
+        metalParam = SoftmaxMetalParam.init(
+            N: Int32(param.input.tensorDim[0]),
+            K: Int32(param.input.tensorDim[1])
+        )
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "softmax_float", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "softmax_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
     }
-  }
-
-  func compute(commandBuffer: MTLCommandBuffer, param: SoftmaxParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encoder is nil")
+    
+    func compute(commandBuffer: MTLCommandBuffer, param: SoftmaxParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encoder is nil")
+        }
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setBytes(&metalParam, length: MemoryLayout<SoftmaxMetalParam>.size, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<SoftmaxMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-  
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SplitKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SplitKernel.swift
index 8b07a87406..d15e372962 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SplitKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SplitKernel.swift
@@ -15,79 +15,79 @@
 import Foundation
 
 struct SplitMetalParam {
-  var idim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1)
-  var axis: Int32 = 0
-  var offset: Int32 = 0
-  var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
-  var vdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
+    var idim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1)
+    var axis: Int32 = 0
+    var offset: Int32 = 0
+    var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
+    var vdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
 }
 
 class SplitKernel<P: PrecisionType>: Kernel, Computable{
-  var smp: SplitMetalParam
-  func compute(commandBuffer: MTLCommandBuffer, param: SplitParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    var smp: SplitMetalParam
+    func compute(commandBuffer: MTLCommandBuffer, param: SplitParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        for i in 0..<param.outputList.count {
+            encoder.setTexture(param.outputList[i].metalTexture, index: i + 1)
+        }
+        encoder.setBytes(&smp, length: MemoryLayout<SplitMetalParam>.size, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture)
+        encoder.endEncoding()
     }
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    for i in 0..<param.outputList.count {
-      encoder.setTexture(param.outputList[i].metalTexture, index: i + 1)
+    
+    required init(device: MTLDevice, param: SplitParam<P>, initContext: InitContext) {
+        //     param.output.initTexture(device: device, computePrecision: computePrecision)
+        let num = param.outputList.count
+        let rank = param.input.tensorDim.cout()
+        assert(num >= 2 && num <= 4)
+        for output in param.outputList {
+            output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+        }
+        smp = SplitMetalParam.init()
+        smp.idim = (Int32(param.input.dim[0]), Int32(param.input.dim[1]), Int32(param.input.dim[2]), Int32(param.input.dim[3]))
+        smp.axis = Int32(param.axis + param.input.dim.cout() - param.input.tensorDim.cout())
+        for i in 0..<4 {
+            if param.input.transpose[i] == smp.axis {
+                smp.axis = Int32(i)
+                break
+            }
+        }
+        smp.trans = (Int32(param.input.transpose[0]), Int32(param.input.transpose[1]), Int32(param.input.transpose[2]), Int32(param.input.transpose[3]))
+        var vdim: [Int32] = [0, 0, 0, 0]
+        for i in 0..<num {
+            vdim[i] = Int32(param.outputList[i].tensorDim[param.axis])
+        }
+        smp.vdim = (vdim[0], vdim[1], vdim[2], vdim[3])
+        var v = "normal"
+        if rank == 4 {
+            if smp.axis == 1 {
+                v = "y"
+            } else if smp.axis == 2 {
+                v = "x"
+            }
+        } else if rank == 3 {
+            if smp.axis == 2 {
+                v = "y"
+            } else if smp.axis == 3 {
+                v = "x"
+            }
+        } else if rank == 2 {
+            if smp.axis == 2 {
+                v = "y"
+            }
+        }
+        if v == "normal" {
+            fatalError("split unsupported")
+        }
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_float", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
     }
-    encoder.setBytes(&smp, length: MemoryLayout<SplitMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture)
-    encoder.endEncoding()
-  }
-  
-  required init(device: MTLDevice, param: SplitParam<P>, initContext: InitContext) {
-    //     param.output.initTexture(device: device, computePrecision: computePrecision)
-    let num = param.outputList.count
-    let rank = param.input.tensorDim.cout()
-    assert(num >= 2 && num <= 4)
-    for output in param.outputList {
-      output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    }
-    smp = SplitMetalParam.init()
-    smp.idim = (Int32(param.input.dim[0]), Int32(param.input.dim[1]), Int32(param.input.dim[2]), Int32(param.input.dim[3]))
-    smp.axis = Int32(param.axis + param.input.dim.cout() - param.input.tensorDim.cout())
-    for i in 0..<4 {
-      if param.input.transpose[i] == smp.axis {
-        smp.axis = Int32(i)
-        break
-      }
-    }
-    smp.trans = (Int32(param.input.transpose[0]), Int32(param.input.transpose[1]), Int32(param.input.transpose[2]), Int32(param.input.transpose[3]))
-    var vdim: [Int32] = [0, 0, 0, 0]
-    for i in 0..<num {
-      vdim[i] = Int32(param.outputList[i].tensorDim[param.axis])
-    }
-    smp.vdim = (vdim[0], vdim[1], vdim[2], vdim[3])
-    var v = "normal"
-    if rank == 4 {
-      if smp.axis == 1 {
-        v = "y"
-      } else if smp.axis == 2 {
-        v = "x"
-      }
-    } else if rank == 3 {
-      if smp.axis == 2 {
-        v = "y"
-      } else if smp.axis == 3 {
-        v = "x"
-      }
-    } else if rank == 2 {
-      if smp.axis == 2 {
-        v = "y"
-      }
-    }
-    if v == "normal" {
-      fatalError("split unsupported")
-    }
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_float", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_half", initContext: initContext)
-    } else {
-      fatalError()
-    }
-  }
-  
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Texture2DTo2DArrayKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Texture2DTo2DArrayKernel.swift
index fd3ba24776..58b3db8d86 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Texture2DTo2DArrayKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Texture2DTo2DArrayKernel.swift
@@ -17,31 +17,31 @@ import MetalKit
 import CoreMedia
 
 struct Texture2DTo2DArrayParam {
-  let input: MTLTexture
-  let output: MTLTexture
-  let expectDim: Dim
+    let input: MTLTexture
+    let output: MTLTexture
+    let expectDim: Dim
 }
 
 class Texture2DTo2DArrayKernel<P: PrecisionType>: Kernel, Computable{
-  func compute(commandBuffer: MTLCommandBuffer, param: FeedParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
-    }
-    encoder.setTexture(param.input.mtlTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.dispatch(computePipline: pipline, outTexture: param.input.mtlTexture)
-    encoder.endEncoding()
-  }
-  
-  required init(device: MTLDevice, param: FeedParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-    if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "texture2d_to_2d_array_half", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "texture2d_to_2d_array", initContext: initContext)
-    } else {
-      fatalError()
+    func compute(commandBuffer: MTLCommandBuffer, param: FeedParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        encoder.setTexture(param.input.mtlTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.dispatch(computePipline: pipline, outTexture: param.input.mtlTexture)
+        encoder.endEncoding()
     }
     
-  }
+    required init(device: MTLDevice, param: FeedParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
+        if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "texture2d_to_2d_array_half", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "texture2d_to_2d_array", initContext: initContext)
+        } else {
+            fatalError()
+        }
+        
+    }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/TransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/TransposeKernel.swift
index e1490052e7..92947dc278 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/TransposeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/TransposeKernel.swift
@@ -15,65 +15,65 @@
 import Foundation
 
 struct TransposeMetalParam {
-  var iC: Int32 = 0
-  var oC: Int32 = 0
-  var axis: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
+    var iC: Int32 = 0
+    var oC: Int32 = 0
+    var axis: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
 }
 
 class TransposeKernel<P: PrecisionType>: Kernel, Computable {
-  var metalParam: TransposeMetalParam = TransposeMetalParam.init()
-  required init(device: MTLDevice, param: TransposeParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
-    let rank = param.input.tensorDim.cout()
-    var axis: [Int] = [0, 1, 2, 3]
-    for i in 0..<param.axis.count {
-      axis[4-rank+i] = 4 - rank + Int(param.axis[i])
-    }
-
-    var naxis: [Int] = [0, 0, 0, 0]
-    for i in 0..<4 {
-      for j in 0..<4 {
-        if param.input.transpose[j] == axis[i] {
-          naxis[i] = j
-          break
+    var metalParam: TransposeMetalParam = TransposeMetalParam.init()
+    required init(device: MTLDevice, param: TransposeParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
+        let rank = param.input.tensorDim.cout()
+        var axis: [Int] = [0, 1, 2, 3]
+        for i in 0..<param.axis.count {
+            axis[4-rank+i] = 4 - rank + Int(param.axis[i])
         }
-      }
-    }
-    metalParam.iC = Int32(param.input.dim[param.input.transpose[3]])
-    metalParam.oC = Int32(param.output.dim[3])
-    metalParam.axis = (Int32(naxis[0]), Int32(naxis[1]), Int32(naxis[2]), Int32(naxis[3]))
-    var kernelFunc = "transpose_undefined"
-    if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.input.transpose == axis {
-        kernelFunc = "transpose_copy_half"
-      } else {
-        kernelFunc = "transpose_\(rank)_half"
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.input.transpose == axis {
-        kernelFunc = "transpose_copy_float"
-      } else {
-        kernelFunc = "transpose_\(rank)_float"
-      }
-    } else {
-      fatalError()
+        
+        var naxis: [Int] = [0, 0, 0, 0]
+        for i in 0..<4 {
+            for j in 0..<4 {
+                if param.input.transpose[j] == axis[i] {
+                    naxis[i] = j
+                    break
+                }
+            }
+        }
+        metalParam.iC = Int32(param.input.dim[param.input.transpose[3]])
+        metalParam.oC = Int32(param.output.dim[3])
+        metalParam.axis = (Int32(naxis[0]), Int32(naxis[1]), Int32(naxis[2]), Int32(naxis[3]))
+        var kernelFunc = "transpose_undefined"
+        if GlobalConfig.shared.computePrecision == .Float16 {
+            if param.input.transpose == axis {
+                kernelFunc = "transpose_copy_half"
+            } else {
+                kernelFunc = "transpose_\(rank)_half"
+            }
+        } else if GlobalConfig.shared.computePrecision == .Float32 {
+            if param.input.transpose == axis {
+                kernelFunc = "transpose_copy_float"
+            } else {
+                kernelFunc = "transpose_\(rank)_float"
+            }
+        } else {
+            fatalError()
+        }
+        print("===========>", kernelFunc)
+        print(metalParam)
+        super.init(device: device, inFunctionName: kernelFunc, initContext: initContext)
     }
-    print("===========>", kernelFunc)
-    print(metalParam)
-    super.init(device: device, inFunctionName: kernelFunc, initContext: initContext)
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    
+    func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setBytes(&metalParam, length: MemoryLayout<TransposeMetalParam>.size, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
-  
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<TransposeMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-  
-
+    
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/MulticlassNMSOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/MulticlassNMSOp.swift
index 6d2e46b649..b438b3c46c 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/MulticlassNMSOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/MulticlassNMSOp.swift
@@ -15,57 +15,57 @@
 import Foundation
 
 class MulticlassNMSParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      scores = try MulticlassNMSParam.getFirstTensor(key: "Scores", map: opDesc.inputs, from: inScope)
-      bboxes = try MulticlassNMSParam.getFirstTensor(key: "BBoxes", map: opDesc.inputs, from: inScope)
-      output = try MulticlassNMSParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      
-      middleOutput = FetchHolder.init(inPaddedCapacity: scores.tensorDim.numel(), inDim: scores.tensorDim)
-      
-      bboxOutput = FetchHolder.init(inPaddedCapacity: bboxes.tensorDim.numel(), inDim: bboxes.tensorDim)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            scores = try MulticlassNMSParam.getFirstTensor(key: "Scores", map: opDesc.inputs, from: inScope)
+            bboxes = try MulticlassNMSParam.getFirstTensor(key: "BBoxes", map: opDesc.inputs, from: inScope)
+            output = try MulticlassNMSParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            
+            middleOutput = FetchHolder.init(inPaddedCapacity: scores.tensorDim.numel(), inDim: scores.tensorDim)
+            
+            bboxOutput = FetchHolder.init(inPaddedCapacity: bboxes.tensorDim.numel(), inDim: bboxes.tensorDim)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  var bboxOutput: FetchHolder
-  var middleOutput: FetchHolder
-  let scores: Texture
-  let bboxes: Texture
-  var output: Texture
+    var bboxOutput: FetchHolder
+    var middleOutput: FetchHolder
+    let scores: Texture
+    let bboxes: Texture
+    var output: Texture
 }
 
 class MulticlassNMSOp<P: PrecisionType>: Operator<MulticlassNMSKernel<P>, MulticlassNMSParam<P>>, Runable, Creator, InferShaperable{
-
-  func inputVariant() -> [String : [MTLBuffer]] {
-    guard let scoreBuffer = para.middleOutput.resultBuffer, let bboxBuffer = para.middleOutput.resultBuffer else {
-      fatalError()
+    
+    /// Exposes the fetch-holder result buffers keyed by NMS input name; traps if either buffer is nil.
+    /// "Scores" is read from `middleOutput` and "BBoxes" from `bboxOutput`.
+    func inputVariant() -> [String : [MTLBuffer]] {
+        guard let scoreBuffer = para.middleOutput.resultBuffer, let bboxBuffer = para.bboxOutput.resultBuffer else { fatalError() }
+        return ["Scores" : [scoreBuffer], "BBoxes" : [bboxBuffer]]
     }
-    return ["Scores" : [scoreBuffer], "BBoxes" : [bboxBuffer]]
-  }
-  
-  func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let _ {
-      fatalError()
+    
+    func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let _ {
+            fatalError()
+        }
+    }
+    
+    func inferShape() {
+        // para.output.dim = para.input.dim
+    }
+    
+    typealias OpType =  MulticlassNMSOp<P>
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        
+    }
+    
+    func delogOutput() {
+        print(" nms - output: ")
+        print(para.bboxes.metalTexture.float32Array().strideArray())
     }
-  }
-  
-  func inferShape() {
-    // para.output.dim = para.input.dim
-  }
-  
-  typealias OpType =  MulticlassNMSOp<P>
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-
-  }
-  
-  func delogOutput() {
-    print(" nms - output: ")
-    print(para.bboxes.metalTexture.float32Array().strideArray())
-  }
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/PoolOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/PoolOp.swift
index e57c8f48e3..8b212f3b1d 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/PoolOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/PoolOp.swift
@@ -15,60 +15,60 @@
 import Foundation
 
 class PoolParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try PoolParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try PoolParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      poolType = try PoolParam.getAttr(key: "pooling_type", attrs: opDesc.attrs)
-      ksize = try PoolParam.getAttr(key: "ksize", attrs: opDesc.attrs)
-      stride = try PoolParam.getAttr(key: "strides", attrs: opDesc.attrs)
-      padding = try PoolParam.getAttr(key: "paddings", attrs: opDesc.attrs)
-      ceilMode = try PoolParam.getAttr(key: "ceil_mode", attrs: opDesc.attrs)
-      globalPooling = try PoolParam.getAttr(key: "global_pooling", attrs: opDesc.attrs)
-      assert(input.transpose == [0, 2, 3, 1])
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            input = try PoolParam.inputX(inputs: opDesc.inputs, from: inScope)
+            output = try PoolParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            poolType = try PoolParam.getAttr(key: "pooling_type", attrs: opDesc.attrs)
+            ksize = try PoolParam.getAttr(key: "ksize", attrs: opDesc.attrs)
+            stride = try PoolParam.getAttr(key: "strides", attrs: opDesc.attrs)
+            padding = try PoolParam.getAttr(key: "paddings", attrs: opDesc.attrs)
+            ceilMode = try PoolParam.getAttr(key: "ceil_mode", attrs: opDesc.attrs)
+            globalPooling = try PoolParam.getAttr(key: "global_pooling", attrs: opDesc.attrs)
+            assert(input.transpose == [0, 2, 3, 1])
+        } catch let error {
+            throw error
+        }
+        //        let buffer = input.metalTexture.buffer.contents().assumingMemoryBound(to: P.self)
     }
-    //        let buffer = input.metalTexture.buffer.contents().assumingMemoryBound(to: P.self)
-  }
-  let input: Texture
-  var output: Texture
-  var ksize: [Int32]
-  var stride: [Int32]
-  var padding: [Int32]
-  var poolType: String
-  var ceilMode: Bool
-  var globalPooling: Bool
+    let input: Texture
+    var output: Texture
+    var ksize: [Int32]
+    var stride: [Int32]
+    var padding: [Int32]
+    var poolType: String
+    var ceilMode: Bool
+    var globalPooling: Bool
 }
 
 class PoolOp<P: PrecisionType>: Operator<PoolKernel<P>, PoolParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = PoolOp<P>
-
-  func inferShape() {
-    // para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = PoolOp<P>
+    
+    func inferShape() {
+        // para.output.dim = para.input.dim
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
-
     
-//    print("pool2d delog")
-//    let _: P? = para.input.metalTexture.logDesc(header: "pool2d input: ", stridable: true)
-//    print(para.ksize)
-//    print(para.stride)
-//    print(para.padding)
-//    print(para.poolType)
-//    let _: P? = para.output.metalTexture.logDesc(header: "pool2d output: ", stridable: true)
-  }
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+        
+        
+        //    print("pool2d delog")
+        //    let _: P? = para.input.metalTexture.logDesc(header: "pool2d input: ", stridable: true)
+        //    print(para.ksize)
+        //    print(para.stride)
+        //    print(para.padding)
+        //    print(para.poolType)
+        //    let _: P? = para.output.metalTexture.logDesc(header: "pool2d output: ", stridable: true)
+    }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/PreluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/PreluOp.swift
index b7150c2fea..09a6b027e3 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/PreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/PreluOp.swift
@@ -15,51 +15,51 @@
 import Foundation
 
 class PreluParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try PreluParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try PreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      alpha = try PreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
-      mode = try PreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    /// Builds the Prelu parameters from an op description.
+    /// - Parameter opDesc: Supplies `inputs`, `outputs`, `paraInputs` and `attrs` (the "mode" attribute is read).
+    /// - Parameter inScope: Scope handed to the lookup helpers.
+    /// - Throws: Any error raised by the lookups, propagated unchanged.
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        input = try PreluParam.inputX(inputs: opDesc.inputs, from: inScope)
+        output = try PreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+        alpha = try PreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
+        mode = try PreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
     }
-  }
-  let mode: String
-  let alpha: Tensor<P>
-  let input: Texture
-  var output: Texture
+    let mode: String
+    let alpha: Tensor<P>
+    let input: Texture
+    var output: Texture
 }
 
 class PreluOp<P: PrecisionType>: Operator<PreluKernel<P>, PreluParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = PreluOp<P>
-
-  func inferShape() {
-    // para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = PreluOp<P>
+    
+    func inferShape() {
+        // No-op: the only statement is disabled (was: para.output.dim = para.input.dim).
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) input: ")
-    print(para.input.metalTexture.toTensor(dim: (n: para.input.padToFourDim[0], c: para.input.padToFourDim[1], h: para.input.padToFourDim[2], w: para.input.padToFourDim[3])).strideArray())
     
-    print(" \(type) Alpha: ")
-    let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false)
+    /// Runs the operator by passing `para` to `kernel.compute` on `buffer`.
+    /// - Parameter device: Unused in this implementation.
+    /// - Parameter buffer: Command buffer handed to `kernel.compute`.
+    /// - Throws: Any error from `kernel.compute`, propagated unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+    
+    /// Debug dump: input texture, alpha buffer (via `logDesc`), then output texture.
+    func delogOutput() {
+        let (inDim, outDim) = (para.input.padToFourDim, para.output.padToFourDim)
+        print(" \(type) input: ")
+        print(para.input.metalTexture.toTensor(dim: (n: inDim[0], c: inDim[1], h: inDim[2], w: inDim[3])).strideArray())
+        print(" \(type) Alpha: ")
+        let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false)
+        print(" \(type) output: ")
+        print(para.output.metalTexture.toTensor(dim: (n: outDim[0], c: outDim[1], h: outDim[2], w: outDim[3])).strideArray())
+    }
     
-    print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
-  }
-  
-//    print("softmax delog")
-//    let _: P? = para.input.metalTexture.logDesc(header: "softmax input: ", stridable: false)
-//    let _: P? = para.output.metalTexture.logDesc(header: "softmax output: ", stridable: false)
+    //    print("softmax delog")
+    //    let _: P? = para.input.metalTexture.logDesc(header: "softmax input: ", stridable: false)
+    //    let _: P? = para.output.metalTexture.logDesc(header: "softmax output: ", stridable: false)
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/PriorBoxOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/PriorBoxOp.swift
index bff7c9870a..80774f22a9 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/PriorBoxOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/PriorBoxOp.swift
@@ -15,109 +15,109 @@
 import Foundation
 
 class PriorBoxParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      min_max_aspect_ratios_order = try PriorBoxParam.getAttr(key: "min_max_aspect_ratios_order", attrs: opDesc.attrs)
-    } catch _ {
+    //typealias ParamPrecisionType = P
+    /// Builds the prior-box parameters from an op description.
+    ///
+    /// "min_max_aspect_ratios_order" is optional; every other input, output and attribute is required.
+    /// - Parameter opDesc: Supplies the `inputs`, `outputs` and `attrs` that are looked up below.
+    /// - Parameter inScope: Scope handed to the input/output lookup helpers.
+    /// - Throws: Any error from the required lookups, propagated unchanged.
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        // Best-effort attribute: a failed lookup leaves the flag at its default `false`.
+        min_max_aspect_ratios_order = (try? PriorBoxParam.getAttr(key: "min_max_aspect_ratios_order", attrs: opDesc.attrs)) ?? false
+        // Required lookups: the first failure is thrown to the caller.
+        input = try PriorBoxParam.input(inputs: opDesc.inputs, from: inScope)
+        output = try PriorBoxParam.outputBoxes(outputs: opDesc.outputs, from: inScope)
+        inputImage = try PriorBoxParam.inputImage(inputs: opDesc.inputs, from: inScope)
+        outputVariances = try PriorBoxParam.outputVariances(outputs: opDesc.outputs, from: inScope)
+        minSizes = try PriorBoxParam.getAttr(key: "min_sizes", attrs: opDesc.attrs)
+        maxSizes = try PriorBoxParam.getAttr(key: "max_sizes", attrs: opDesc.attrs)
+        aspectRatios = try PriorBoxParam.getAttr(key: "aspect_ratios", attrs: opDesc.attrs)
+        variances = try PriorBoxParam.getAttr(key: "variances", attrs: opDesc.attrs)
+        flip = try PriorBoxParam.getAttr(key: "flip", attrs: opDesc.attrs)
+        clip = try PriorBoxParam.getAttr(key: "clip", attrs: opDesc.attrs)
+        stepW = try PriorBoxParam.getAttr(key: "step_w", attrs: opDesc.attrs)
+        stepH = try PriorBoxParam.getAttr(key: "step_h", attrs: opDesc.attrs)
+        offset = try PriorBoxParam.getAttr(key: "offset", attrs: opDesc.attrs)
     }
     
-    do {
-      input = try PriorBoxParam.input(inputs: opDesc.inputs, from: inScope)
-      output = try PriorBoxParam.outputBoxes(outputs: opDesc.outputs, from: inScope)
-      inputImage = try PriorBoxParam.inputImage(inputs: opDesc.inputs, from: inScope)
-      outputVariances = try PriorBoxParam.outputVariances(outputs: opDesc.outputs, from: inScope)
-      minSizes = try PriorBoxParam.getAttr(key: "min_sizes", attrs: opDesc.attrs)
-      maxSizes = try PriorBoxParam.getAttr(key: "max_sizes", attrs: opDesc.attrs)
-      aspectRatios = try PriorBoxParam.getAttr(key: "aspect_ratios", attrs: opDesc.attrs)
-      variances = try PriorBoxParam.getAttr(key: "variances", attrs: opDesc.attrs)
-      flip = try PriorBoxParam.getAttr(key: "flip", attrs: opDesc.attrs)
-      clip = try PriorBoxParam.getAttr(key: "clip", attrs: opDesc.attrs)
-      stepW = try PriorBoxParam.getAttr(key: "step_w", attrs: opDesc.attrs)
-      stepH = try PriorBoxParam.getAttr(key: "step_h", attrs: opDesc.attrs)
-      offset = try PriorBoxParam.getAttr(key: "offset", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
-    }
-  }
-  
-  var min_max_aspect_ratios_order: Bool = false
-  let minSizes: [Float32]
-  let maxSizes: [Float32]
-  let aspectRatios: [Float32]
-  var newAspectRatios: MTLBuffer?
-  let variances: [Float32]
-  let flip: Bool
-  let clip: Bool
-  var stepW: Float32
-  var stepH: Float32
-  let offset: Float32
-  
-  let input: Texture
-  let inputImage: Texture
-  var output: Texture
-  let outputVariances: Texture
+    var min_max_aspect_ratios_order: Bool = false
+    let minSizes: [Float32]
+    let maxSizes: [Float32]
+    let aspectRatios: [Float32]
+    var newAspectRatios: MTLBuffer?
+    let variances: [Float32]
+    let flip: Bool
+    let clip: Bool
+    var stepW: Float32
+    var stepH: Float32
+    let offset: Float32
+    
+    let input: Texture
+    let inputImage: Texture
+    var output: Texture
+    let outputVariances: Texture
 }
 
 class PriorBoxOp<P: PrecisionType>: Operator<PriorBoxKernel<P>, PriorBoxParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = PriorBoxOp<P>
-
-  func inferShape() {
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
-    }
-  }
-  
-  func delogOutput() {
-
-    print(" \(type) output: ")
-    // output
-//    let outputArray = para.output.metalTexture.float32Array()
-//    print(outputArray.strideArray())
-//    let device = para.input.metalTexture!.device
-//    let boxes:[Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: [2,0,1,3])
-//    let variances:[Float32] = device.texture2tensor(texture: para.outputVariances.metalTexture!, dim: para.outputVariances.tensorDim.dims, transpose: [2,0,1,3])
-//    print("boxes: ")
-//    print(boxes.strideArray())
-//    print("variances: ")
-//    print(variances.strideArray())
-    // output
-    print(" \(type) output: ")
     
-    let box = para.output.metalTexture.realNHWC(dim: (para.output.dim[0], para.output.dim[1], para.output.dim[2], para.output.dim[3]))
-    print(" dim: \(para.output.dim)")
-    print(box.strideArray())
-//    print((0..<box.count).map { (index: $0, value: box[$0])})
-//    print(para.output.realNHWC().strideArray())
+    typealias OpType = PriorBoxOp<P>
     
-//    let padToFourDim = para.output.padToFourDim
-//    if para.output.transpose == [0, 1, 2, 3] {
-//      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]), texturePrecision: computePrecision)
-//      print(outputArray.strideArray())
-//    } else if para.output.transpose == [0, 2, 3, 1] {
-//      print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3]), texturePrecision: computePrecision).strideArray())
-//    } else {
-//      print(" not implement")
-//    }
-    
-//    writeToLibrary(fileName: "box_out", array: outputArray)
-    
-    // output variance
-//    let outputVarianceArray = para.outputVariances.metalTexture.floatArray { (o: Float32) -> Float32 in
-//      return o
-//    }
-//
-//    print(" output variance: \(outputVarianceArray)")
+    func inferShape() {  // empty body: nothing is computed here
+    }
     
-//    writeToLibrary(fileName: "variance_out", array: outputVarianceArray)
+    /// Runs the operator by passing `para` to `kernel.compute` on `buffer`.
+    /// - Parameter device: Unused in this implementation.
+    /// - Parameter buffer: Command buffer handed to `kernel.compute`.
+    /// - Throws: Any error from `kernel.compute`, propagated unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
     
-  }
+    /// Debug dump: prints the op type header, then `para.output.dim` and the output texture read back via `realNHWC`.
+    func delogOutput() {
+        print(" \(type) output: ")
+        // output
+        //    let outputArray = para.output.metalTexture.float32Array()
+        //    print(outputArray.strideArray())
+        //    let device = para.input.metalTexture!.device
+        //    let boxes:[Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: [2,0,1,3])
+        //    let variances:[Float32] = device.texture2tensor(texture: para.outputVariances.metalTexture!, dim: para.outputVariances.tensorDim.dims, transpose: [2,0,1,3])
+        //    print("boxes: ")
+        //    print(boxes.strideArray())
+        //    print("variances: ")
+        //    print(variances.strideArray())
+        // output (NOTE(review): the same header is printed a second time here — confirm it is intended)
+        print(" \(type) output: ")
+        
+        let box = para.output.metalTexture.realNHWC(dim: (para.output.dim[0], para.output.dim[1], para.output.dim[2], para.output.dim[3]))
+        print(" dim: \(para.output.dim)")
+        print(box.strideArray())
+        //    print((0..<box.count).map { (index: $0, value: box[$0])})
+        //    print(para.output.realNHWC().strideArray())
+        
+        //    let padToFourDim = para.output.padToFourDim
+        //    if para.output.transpose == [0, 1, 2, 3] {
+        //      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]), texturePrecision: computePrecision)
+        //      print(outputArray.strideArray())
+        //    } else if para.output.transpose == [0, 2, 3, 1] {
+        //      print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3]), texturePrecision: computePrecision).strideArray())
+        //    } else {
+        //      print(" not implement")
+        //    }
+        
+        //    writeToLibrary(fileName: "box_out", array: outputArray)
+        
+        // output variance
+        //    let outputVarianceArray = para.outputVariances.metalTexture.floatArray { (o: Float32) -> Float32 in
+        //      return o
+        //    }
+        //
+        //    print(" output variance: \(outputVarianceArray)")
+        
+        //    writeToLibrary(fileName: "variance_out", array: outputVarianceArray)
+        
+    }
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ReluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ReluOp.swift
index ef10908106..a286114b3f 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ReluOp.swift
@@ -16,44 +16,44 @@
 import Foundation
 
 class ReluParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try ReluParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try ReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    /// Builds the Relu parameters from an op description.
+    /// - Parameter opDesc: Supplies the `inputs` and `outputs` that are looked up.
+    /// - Parameter inScope: Scope handed to the lookup helpers.
+    /// - Throws: Any error raised by the lookups, propagated unchanged.
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        input = try ReluParam.inputX(inputs: opDesc.inputs, from: inScope)
+        output = try ReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
     }
-  }
-  let input: Texture
-  var output: Texture
+    let input: Texture
+    var output: Texture
 }
 
 class ReluOp<P: PrecisionType>: Operator<ReluKernel<P>, ReluParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = ReluOp<P>
-  
-  func inferShape() {
-    para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = ReluOp<P>
+    
+    func inferShape() {
+        para.output.dim = para.input.dim  // the output takes the input's dim unchanged
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output.metalTexture)
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
-//    let device = para.output.metalTexture!.device
-//    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-//    print(outputArray.strideArray())
-  }
-  
+    
+    /// Runs the operator by passing `para` to `kernel.compute` on `buffer`.
+    /// - Parameter device: Unused in this implementation.
+    /// - Parameter buffer: Command buffer handed to `kernel.compute`.
+    /// - Throws: Any error from `kernel.compute`, propagated unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+    
+    /// Debug dump: prints the op type, the output `metalTexture`, then its `toTensor(...).strideArray()` form.
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output.metalTexture)
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+        //    let device = para.output.metalTexture!.device
+        //    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+        //    print(outputArray.strideArray())
+    }
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ReshapeOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ReshapeOp.swift
index e40eae02d0..417344f1da 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ReshapeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ReshapeOp.swift
@@ -16,63 +16,63 @@ import Foundation
 import Metal
 
 class ReshapeParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try ReshapeParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try ReshapeParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      shape = try ReshapeParam.getAttr(key: "shape", attrs: opDesc.attrs)
-        
-      var s: [Int] = shape.map { Int($0) }
-      
-      var di = -1
-      var ml = 1
-      for i in 0..<s.count {
-        if s[i] == -1 {
-          di = i
-          continue
+    //typealias ParamPrecisionType = P
+    /// Builds the reshape parameters and derives the output dims from the "shape" attribute.
+    ///
+    /// A `-1` entry in the shape is replaced by `input.dim.numel()` divided by the product of
+    /// the other entries; the result is right-aligned into four dims, padded with leading 1s.
+    /// - Parameter opDesc: Supplies `inputs`, `outputs` and the "shape" attribute.
+    /// - Parameter inScope: Scope handed to the input/output lookup helpers.
+    /// - Throws: Any error raised by the lookups, propagated unchanged.
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        input = try ReshapeParam.inputX(inputs: opDesc.inputs, from: inScope)
+        output = try ReshapeParam.outputOut(outputs: opDesc.outputs, from: inScope)
+        shape = try ReshapeParam.getAttr(key: "shape", attrs: opDesc.attrs)
+        var resolved: [Int] = shape.map { Int($0) }
+        // Last index holding -1 (if any) and the product of every other entry.
+        var wildcard: Int? = nil
+        var known = 1
+        for (position, extent) in resolved.enumerated() {
+            if extent == -1 { wildcard = position } else { known *= extent }
+        }
+        if let slot = wildcard {
+            // NOTE(review): traps on division by zero if another entry is 0 — confirm inputs.
+            resolved[slot] = input.dim.numel() / known
+        }
+        output.tensorDim = Dim.init(inDim: resolved)
+        // NOTE(review): assumes at most 4 entries; more would index `padded` out of range.
+        var padded: [Int] = [1, 1, 1, 1]
+        for (position, extent) in resolved.enumerated() {
+            padded[4 - resolved.count + position] = extent
+        }
+        output.padToFourDim = Dim.init(inDim: padded)
+        output.dim = output.padToFourDim
-        }
-        ml *= s[i]
-      }
-      if di >= 0 {
-        s[di] = input.dim.numel() / ml
-      }
-      output.tensorDim = Dim.init(inDim: s)
-      var dim: [Int] = [1, 1, 1, 1]
-      for i in 0..<s.count {
-        dim[4-s.count+i] = s[i]
-      }
-      output.padToFourDim = Dim.init(inDim: dim)
-      output.dim = output.padToFourDim
-    } catch let error {
-      throw error
     }
-  }
-  let input: Texture
-  let shape: [Int32]
-  var output: Texture
+    let input: Texture
+    let shape: [Int32]
+    var output: Texture
 }
 
 class ReshapeOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = ReshapeOp<P>
-
-  func inferShape() {
-    // para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = ReshapeOp<P>
+    
+    func inferShape() {
+        // No-op: the only statement is disabled (was: para.output.dim = para.input.dim).
+    }
+    
+    /// Runs the operator by passing `para` to `kernel.compute` on `buffer`.
+    /// - Parameter device: Unused in this implementation.
+    /// - Parameter buffer: Command buffer handed to `kernel.compute`.
+    /// - Throws: Any error from `kernel.compute`, propagated unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+    /// Debug dump: prints "reshape delog", then the output texture read back through `texture2tensor`.
+    func delogOutput() {
+        print("reshape delog")
+        let values: [Float32] = para.output.metalTexture!.device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+        print(values.strideArray())
+        //    print(values)
     }
-  }
-  func delogOutput() {
-    print("reshape delog")
-    let device = para.output.metalTexture!.device
-    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-    print(outputArray.strideArray())
-//    print(outputArray)
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ResizeBilinearOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ResizeBilinearOp.swift
index 980bb734a7..e71a62b682 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ResizeBilinearOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ResizeBilinearOp.swift
@@ -15,50 +15,44 @@
 import Foundation
 
 class ResizeBilinearParam<P: PrecisionType>: OpParam {
-  typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try ResizeBilinearParam.inputX(inputs: opDesc.inputs, from: inScope)
-//      if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) {
-//        fatalError()
-//      }
-      output = try ResizeBilinearParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      out_h = try ResizeBilinearParam.getAttr(key: "out_h", attrs: opDesc.attrs)
-      out_w = try ResizeBilinearParam.getAttr(key: "out_w", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
+    typealias ParamPrecisionType = P
+    /// Builds the resize-bilinear parameters from an op description.
+    /// - Parameter opDesc: Supplies `inputs`, `outputs` and the "out_h" / "out_w" attributes.
+    /// - Parameter inScope: Scope handed to the input/output lookup helpers.
+    /// - Throws: Any error raised by the lookups, propagated unchanged.
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        input = try ResizeBilinearParam.inputX(inputs: opDesc.inputs, from: inScope)
+        //      if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) {
+        //        fatalError()
+        //      }
+        output = try ResizeBilinearParam.outputOut(outputs: opDesc.outputs, from: inScope)
+        out_h = try ResizeBilinearParam.getAttr(key: "out_h", attrs: opDesc.attrs)
+        out_w = try ResizeBilinearParam.getAttr(key: "out_w", attrs: opDesc.attrs)
     }
-  }
-  let input: Texture
-  var output: Texture
-  let out_h: Int32
-  let out_w: Int32
+    let input: Texture
+    var output: Texture
+    let out_h: Int32
+    let out_w: Int32
 }
 
 class ResizeBilinearOp<P: PrecisionType>: Operator<ResizeBilinearKernel<P>, ResizeBilinearParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = ResizeBilinearOp<P>
-
-  func inferShape() {
-    //        para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = ResizeBilinearOp<P>
+    
+    func inferShape() {
+        // No-op: the only statement is disabled (was: para.output.dim = para.input.dim).
+    }
+    
+    /// Runs the operator by passing `para` to `kernel.compute` on `buffer`.
+    /// - Parameter device: Unused in this implementation.
+    /// - Parameter buffer: Command buffer handed to `kernel.compute`.
+    /// - Throws: Any error from `kernel.compute`, propagated unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+    /// Debug hook: prints only the op type header; no tensor contents are dumped.
+    func delogOutput() {
+        print(" \(type) output: ")
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-  }
-  
+    
 }
-
-
-
-
-
-
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ShapeOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ShapeOp.swift
index c13c3864e4..fd358a67ae 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ShapeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ShapeOp.swift
@@ -15,39 +15,39 @@
 import Foundation
 
 class ShapeParam<P: PrecisionType>: OpParam {
- // typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try ShapeParam.input(inputs: opDesc.inputs, from: inScope)
-      output = try ShapeParam.outputOut(outputs: opDesc.outputs, from: inScope)
-    } catch let error {
-      throw error
+    // typealias ParamPrecisionType = P
+    /// Builds the shape-op parameters from an op description.
+    /// - Parameter opDesc: Supplies the `inputs` and `outputs` that are looked up.
+    /// - Parameter inScope: Scope handed to the lookup helpers.
+    /// - Throws: Any error raised by the lookups, propagated unchanged.
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        input = try ShapeParam.input(inputs: opDesc.inputs, from: inScope)
+        output = try ShapeParam.outputOut(outputs: opDesc.outputs, from: inScope)
     }
-  }
-  var output: Texture
-  let input: Texture
+    var output: Texture
+    let input: Texture
 }
 
 class ShapeOp<P: PrecisionType>: Operator<ShapeKernel<P>, ShapeParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = ShapeOp<P>
-
-  func inferShape() {
-    //        para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = ShapeOp<P>
+    
+    func inferShape() {
+        // No-op: the only statement is disabled (was: para.output.dim = para.input.dim).
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-  }
-  
+    
+    /// Runs the operator by passing `para` to `kernel.compute` on `buffer`.
+    /// - Parameter device: Unused in this implementation.
+    /// - Parameter buffer: Command buffer handed to `kernel.compute`.
+    /// - Throws: Any error from `kernel.compute`, propagated unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+    /// Debug hook: prints only the op type header; no tensor contents are dumped.
+    func delogOutput() {
+        print(" \(type) output: ")
+    }
+    
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/SoftmaxOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/SoftmaxOp.swift
index 2b2455eaa6..f13bf20195 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/SoftmaxOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/SoftmaxOp.swift
@@ -16,48 +16,48 @@ import Foundation
 import Metal
 
 class SoftmaxParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try SoftmaxParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try SoftmaxParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      
-      //assert(input.tensorDim.dims.count == 2)
-      //assert(input.transpose == [0, 1, 2, 3])
-      
-      output.dim = input.dim
-      output.tensorDim = input.tensorDim
-      output.padToFourDim = input.padToFourDim
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    /// Builds the softmax parameters from an op description.
+    ///
+    /// The output's `dim`, `tensorDim` and `padToFourDim` are copied from the input.
+    /// - Parameter opDesc: Supplies the `inputs` and `outputs` that are looked up.
+    /// - Parameter inScope: Scope handed to the lookup helpers.
+    /// - Throws: Any error raised by the lookups, propagated unchanged.
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        input = try SoftmaxParam.inputX(inputs: opDesc.inputs, from: inScope)
+        output = try SoftmaxParam.outputOut(outputs: opDesc.outputs, from: inScope)
+        //assert(input.tensorDim.dims.count == 2)
+        //assert(input.transpose == [0, 1, 2, 3])
+        output.dim = input.dim
+        output.tensorDim = input.tensorDim
+        output.padToFourDim = input.padToFourDim
     }
-  }
-  let input: Texture
-  var output: Texture
+    let input: Texture
+    var output: Texture
 }
 
 class SoftmaxOp<P: PrecisionType>: Operator<SoftmaxKernel<P>, SoftmaxParam<P>>, Runable, Creator, InferShaperable{
-  typealias OpType = SoftmaxOp<P>
-
-  func inferShape() {
-    // para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    typealias OpType = SoftmaxOp<P>
+    
+    func inferShape() {
+        // No-op: the only statement is disabled (was: para.output.dim = para.input.dim).
     }
-  }
-  
-  func delogOutput() {
-    print("softmax delog")
-    print(para.input)
     
-    print(para.output)
-    let padToFourDim = para.output.padToFourDim
-    let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
-    print(outputArray.strideArray())
-  }
+    /// Runs the operator by passing `para` to `kernel.compute` on `buffer`.
+    /// - Parameter device: Unused in this implementation.
+    /// - Parameter buffer: Command buffer handed to `kernel.compute`.
+    /// - Throws: Any error from `kernel.compute`, propagated unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+    
+    /// Debug dump: prints the input and output textures, then the output read back via `realNHWC`.
+    func delogOutput() {
+        print("softmax delog")
+        print(para.input)
+        print(para.output)
+        let dims = para.output.padToFourDim
+        let values: [Float32] = para.output.metalTexture.realNHWC(dim: (n: dims[0], h: dims[1], w: dims[2], c: dims[3]))
+        print(values.strideArray())
+    }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/SplitOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/SplitOp.swift
index 4d9933f392..4d5cb9b0be 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/SplitOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/SplitOp.swift
@@ -15,63 +15,63 @@
 import Foundation
 
 class SplitParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try SplitParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = Texture.init(device: input.metalTexture!.device, inDim: input.dim)
-      axis = try SplitParam.getAttr(key: "axis", attrs: opDesc.attrs)
-      sections = try SplitParam.getAttr(key: "sections", attrs: opDesc.attrs)
-      if axis < 0 {
-        axis = input.tensorDim.cout() + axis
-      }
-      guard let outlist = opDesc.outputs["Out"] else {
-        fatalError()
-      }
-      for out in outlist {
-        guard let variant = inScope[out], let v = variant as? Texture else {
-          fatalError()
+    //typealias ParamPrecisionType = P
+    /// Builds the split parameters and collects the output textures listed under "Out".
+    ///
+    /// A negative "axis" is offset by `input.tensorDim.cout()`. For every output, its extent along
+    /// `axis` is appended to `sections` (after the values read from the "sections" attribute).
+    /// Traps via `fatalError()` when "Out" is missing or an entry is not a `Texture` in scope.
+    /// - Parameter opDesc: Supplies `inputs`, `outputs` and the "axis" / "sections" attributes.
+    /// - Parameter inScope: Scope used for the input lookup and to resolve each output name.
+    /// - Throws: Any error raised by the input/attribute lookups, propagated unchanged.
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        input = try SplitParam.inputX(inputs: opDesc.inputs, from: inScope)
+        output = Texture.init(device: input.metalTexture!.device, inDim: input.dim)
+        axis = try SplitParam.getAttr(key: "axis", attrs: opDesc.attrs)
+        sections = try SplitParam.getAttr(key: "sections", attrs: opDesc.attrs)
+        // A negative axis is shifted by the value of `input.tensorDim.cout()`.
+        if axis < 0 { axis += input.tensorDim.cout() }
+        guard let names = opDesc.outputs["Out"] else { fatalError() }
+        for name in names {
+            guard let texture = inScope[name] as? Texture else { fatalError() }
+            outputList.append(texture)
+            // NOTE(review): appended on top of the attribute's values — confirm the kernel expects both.
+            sections.append(Int32(texture.tensorDim.dims[axis]))
+        }
-        }
-        outputList.append(v)
-        sections.append(Int32(v.tensorDim.dims[axis]))
-      }
-    } catch let error {
-      throw error
     }
-  }
-  
-  var axis: Int
-  let input: Texture
-  var output: Texture
-  var outputList: [Texture] = []
-  var sections: [Int32] = []
+    
+    var axis: Int
+    let input: Texture
+    var output: Texture
+    var outputList: [Texture] = []
+    var sections: [Int32] = []
 }
 
 class SplitOp<P: PrecisionType>: Operator<SplitKernel<P>, SplitParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = SplitOp<P>
-
-  func inferShape() {
-    //        para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = SplitOp<P>
+    
+    func inferShape() {
+        // No-op: the only statement is disabled (was: para.output.dim = para.input.dim).
+    }
+    
+    /// Runs the operator by passing `para` to `kernel.compute` on `buffer`.
+    /// - Parameter device: Unused in this implementation.
+    /// - Parameter buffer: Command buffer handed to `kernel.compute`.
+    /// - Throws: Any error from `kernel.compute`, propagated unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    let device = para.input.metalTexture!.device
-    for out in para.outputList {
-      let arr: [Float32] = device.texture2tensor(texture: out.metalTexture, dim: out.tensorDim.dims, transpose: out.transpose)
-      print(arr.strideArray())
+    /// Debug dump: prints the op type, then each texture in `para.outputList` read back via `texture2tensor`.
+    func delogOutput() {
+        print(" \(type) output: ")
+        let device = para.input.metalTexture!.device
+        para.outputList.forEach { out in
+            let values: [Float32] = device.texture2tensor(texture: out.metalTexture, dim: out.tensorDim.dims, transpose: out.transpose)
+            print(values.strideArray())
+        }
     }
-  }
-  
+    
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/TransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/TransposeOp.swift
index 064955fcac..c05c080667 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/TransposeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/TransposeOp.swift
@@ -16,43 +16,43 @@ import Foundation
 import Metal
 
 class TransposeParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try TransposeParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try TransposeParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      axis = try TransposeParam.getAttr(key: "axis", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    /// Resolves the op's input, output and "axis" attribute via `inputX`, `outputOut` and `getAttr`.
+    /// - Parameter opDesc: supplies the input/output name tables and the attribute table.
+    /// - Parameter inScope: scope handed to the input/output lookups.
+    /// - Throws: the error of the first failing lookup, passed through unchanged.
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        input = try TransposeParam.inputX(inputs: opDesc.inputs, from: inScope)
+        output = try TransposeParam.outputOut(outputs: opDesc.outputs, from: inScope)
+        axis = try TransposeParam.getAttr(key: "axis", attrs: opDesc.attrs)
     }
-  }
-  let input: Texture
-  var output: Texture
-  let axis: [Int32]
+    let input: Texture
+    var output: Texture
+    let axis: [Int32]
 }
 
 class TransposeOp<P: PrecisionType>: Operator<TransposeKernel<P>, TransposeParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = TransposeOp<P>
-
-  func inferShape() {
-    //para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = TransposeOp<P>
+    
+    func inferShape() {  // currently a no-op: the only statement below is commented out
+        //para.output.dim = para.input.dim
+    }
+    
+    /// Runs the transpose kernel by handing `buffer` and `para` to `kernel.compute`.
+    /// - Parameter device: not referenced by this implementation.
+    /// - Parameter buffer: command buffer forwarded to the kernel.
+    /// - Throws: any error from `kernel.compute`, passed through unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+    
+    func delogOutput() {  // debug aid: prints strideArray() of the output texture read back as Float32
+        print(" \(type) output: ")
+        let target = para.output
+        let values: [Float32] = target.metalTexture!.device.texture2tensor(texture: target.metalTexture, dim: target.tensorDim.dims, transpose: target.transpose)
+        print(values.strideArray())
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    let device = para.output.metalTexture!.device
-    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-    print(outputArray.strideArray())
-  }
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/PMBlockDesc.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/PMBlockDesc.swift
index b021b09008..27ed620c24 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Program/PMBlockDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Program/PMBlockDesc.swift
@@ -45,13 +45,13 @@ public class PMBlockDesc {
 }
 
 extension PMBlockDesc: CustomStringConvertible, CustomDebugStringConvertible {
-  public var description: String {
+    public var description: String {
         var str = ""
         
         for i in 0..<ops.count {
-          str += " op \(i): "
-          let op = ops[i]
-          str += op.description
+            str += " op \(i): "
+            let op = ops[i]
+            str += op.description
         }
         
         for varDesc in vars {
@@ -61,7 +61,7 @@ extension PMBlockDesc: CustomStringConvertible, CustomDebugStringConvertible {
         return str
     }
     
-  public var debugDescription: String {
+    public var debugDescription: String {
         return description
     }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/PMOpDesc.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/PMOpDesc.swift
index 663677150e..51a9e6be2f 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Program/PMOpDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Program/PMOpDesc.swift
@@ -15,12 +15,12 @@
 import Foundation
 
 class PMOpDesc {
-  let inputs: [String : [String]]
-  var paraInputs: [String : [String]]
-  var outputs: [String : [String]]
-  let unusedOutputs: [String : [String]]
-  var attrs: [String : Attr] = [:]
-  var type: String
+    let inputs: [String : [String]]
+    var paraInputs: [String : [String]]
+    var outputs: [String : [String]]
+    let unusedOutputs: [String : [String]]
+    var attrs: [String : Attr] = [:]
+    var type: String
     init(protoOpDesc: OpDesc) {
         type = protoOpDesc.type
         let creator = { (vars: [OpDesc_Var], canAdd: (String) -> Bool) -> [String : [String]] in
@@ -58,24 +58,24 @@ class PMOpDesc {
 }
 
 extension PMOpDesc: CustomStringConvertible, CustomDebugStringConvertible {
-  var description: String {
-    var str = ""
-    str += "op type: \(type): \n"
-    str += "    op inputs: \n"
-    str += "        \(inputs) \n"
-    str += "    op para inputs: \n"
-    str += "        \(paraInputs) \n"
-    str += "    op para outputs: \n"
-    str += "        \(outputs) \n"
-    str += "    op attrs: \n"
-    str += "        \(attrs) \n"
+    /// Multi-line dump of this op for logs: type, inputs, para inputs, outputs, attrs.
+    /// Each value is rendered by string interpolation, so the exact text follows the
+    /// dictionaries' own descriptions.
+    var description: String {
+        let typeLine = "op type: \(type): \n"
+        let inputLines = "    op inputs: \n" + "        \(inputs) \n"
+        let paraInputLines = "    op para inputs: \n" + "        \(paraInputs) \n"
+        // The heading says "para outputs" but the value printed is `outputs`; kept as is
+        // because the heading is runtime text.
+        let outputLines = "    op para outputs: \n" + "        \(outputs) \n"
+        let attrLines = "    op attrs: \n" + "        \(attrs) \n"
+        // Sections are joined in the order they are built above.
+        return typeLine + inputLines + paraInputLines + outputLines + attrLines
+    }
+    
+    var debugDescription: String {  // debug text is the same string as `description`
+        return description
+    }
+    
     
-    return str
-  }
-  
-  var debugDescription: String {
-    return description
-  }
-  
-  
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/PMVarDesc.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/PMVarDesc.swift
index 130e6f49fb..e97f448e29 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Program/PMVarDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Program/PMVarDesc.swift
@@ -79,7 +79,7 @@ public class PMVarDesc {
 }
 
 extension PMVarDesc: CustomStringConvertible, CustomDebugStringConvertible {
-  public var description: String {
+    public var description: String {
         var str = ""
         str += "var name \(name): \n"
         if let inTensorDesc = tensorDesc {
@@ -93,7 +93,7 @@ extension PMVarDesc: CustomStringConvertible, CustomDebugStringConvertible {
         return str
     }
     
-  public var debugDescription: String {
+    public var debugDescription: String {
         return description
     }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift
index dcb065de3d..e4248b6409 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift
@@ -15,286 +15,286 @@
 import Foundation
 
 precedencegroup ChainNode {
-  associativity: left
-  higherThan: MultiplicationPrecedence
+    associativity: left  // a --> b --> c groups as (a --> b) --> c
+    higherThan: MultiplicationPrecedence  // so --> binds tighter than * and /
 }
 
 infix operator --> : ChainNode
 
 class Node {
-  var inputs: [Node] = []
-  var outputs: [Node] = []
-  var type: String
-  var opDesc: PMOpDesc?
-  init(inOpDesc: PMOpDesc) {
-    type = inOpDesc.type
-    opDesc = inOpDesc
-  }
-  
-  init(inType: String) {
-    type = inType
-  }
-  
-  subscript(index: Int) -> [Node] {
-    var nodes: [Node] = []
-    getNodesWithLocation(index: index, nowIndex: 0, nodes: &nodes)
-    return nodes
-  }
-  
-  func getNodesWithLocation(index: Int, nowIndex: Int, nodes: inout [Node]) {
-    if index == nowIndex {
-      nodes.append(self)
+    var inputs: [Node] = []   // upstream nodes; appended to by the `-->` operator
+    var outputs: [Node] = []  // downstream nodes; walked by every traversal in this class
+    var type: String          // op type name; compared by `==` when matching patterns
+    var opDesc: PMOpDesc?     // backing op description; not set by init(inType:)
+    init(inOpDesc: PMOpDesc) {  // node for a real op: type is taken from the description
+        type = inOpDesc.type
+        opDesc = inOpDesc
     }
     
-    for output in outputs {
-      output.getNodesWithLocation(index: index, nowIndex: nowIndex + 1, nodes: &nodes)
+    init(inType: String) {  // type-only node; `opDesc` keeps its default nil
+        type = inType
     }
-  }
-  
-  static func -->(lNode: Node, rNode: Node) -> Node {
-    lNode.outputs.append(rNode)
-    rNode.inputs.append(lNode)
-    return rNode
-  }
-  
-  func depth(begin: UInt = 1) -> UInt {
-    var beginMax: UInt = 1
-    for output in outputs {
-      let subDepth = output.depth(begin: begin + 1)
-      beginMax = max(begin, subDepth)
-    }
-    beginMax = max(begin, beginMax)
-    return beginMax
-  }
-  
-  func to(depth: UInt) -> Node {
-    let beginNode = Node.init(inType: type)
-    beginNode.opDesc = opDesc
-    to(depth: depth - 1, withNode: beginNode)
-    return beginNode
-  }
-  
-  func folderWith(fusion: Fusion.Type, removedNodes: inout [Node]) {
-    let fusionNode = fusion.fusionNode()
-    let change = fusion.change()
-    let inOutputs = outputs
-    outputs.removeAll()
-    opDesc?.outputs.removeAll()
-    for i in 0..<inOutputs.count {
-      inOutputs[i].folderWith(beginNode: self, matchNode: fusionNode.outputs[i], change: change, removedNodes: &removedNodes)
+    
+    subscript(index: Int) -> [Node] {
+        var found: [Node] = []  // nodes exactly `index` output-hops below self (0 is self)
+        getNodesWithLocation(index: index, nowIndex: 0, nodes: &found)
+        return found
     }
-    opDesc?.type = fusion.fusionType()
-    type = fusion.fusionType()
-  }
-  
-  private func folderWith(beginNode: Node, matchNode: Node, change: [String : [(from: String, to: String)]], removedNodes: inout [Node]) {
-    guard let inOpdesc = opDesc else {
-      fatalError()
+    
+    /// Appends to `nodes` every node that sits `index - nowIndex` output-hops below this one.
+    /// Matching does not stop the walk: each child is still visited with `nowIndex + 1`.
+    func getNodesWithLocation(index: Int, nowIndex: Int, nodes: inout [Node]) {
+        // `self` is recorded when the current level is the requested one.
+        if nowIndex == index { nodes.append(self) }
+        outputs.forEach { child in
+            child.getNodesWithLocation(index: index, nowIndex: nowIndex + 1, nodes: &nodes)
+        }
     }
     
-    for attr in inOpdesc.attrs {
-      beginNode.opDesc?.attrs[attr.key] = attr.value
-      //            print(beginNode.opDesc?.attrs)
+    static func -->(lNode: Node, rNode: Node) -> Node {
+        lNode.outputs += [rNode]   // forward edge
+        rNode.inputs += [lNode]    // back edge
+        return rNode               // handing back the right node lets a --> b --> c chain onward
     }
     
-    for paraInput in inOpdesc.paraInputs {
-      if let inChanges = change[type] {
-        for keyChange in inChanges {
-          if keyChange.from == paraInput.key {
-            beginNode.opDesc?.paraInputs[keyChange.to] = paraInput.value
-          } else {
-            beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value
-          }
+    /// Deepest level reachable through `outputs`, with this node counted as level `begin`.
+    func depth(begin: UInt = 1) -> UInt {
+        var beginMax: UInt = max(begin, 1)
+        for output in outputs {
+            // Accumulate across all branches; `max(begin, subDepth)` kept only the last branch.
+            beginMax = max(beginMax, output.depth(begin: begin + 1))
+        }
-      } else {
-        beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value
-      }
+        return beginMax
     }
     
-    if matchNode.outputs.count == 0 {
-      beginNode.outputs.append(contentsOf: outputs)
-      beginNode.opDesc?.outputs = inOpdesc.outputs
-      
+    func to(depth: UInt) -> Node {
+        let root = Node(inType: type)  // fresh node sharing this node's type and opDesc
+        root.opDesc = opDesc
+        to(depth: depth - 1, withNode: root)  // NOTE(review): a `depth` of 0 underflows UInt here
+        return root
     }
-    removedNodes.append(self)
     
-    for i in 0..<matchNode.outputs.count {
-      outputs[i].folderWith(beginNode: beginNode, matchNode: matchNode.outputs[i], change: change, removedNodes: &removedNodes)
+    func folderWith(fusion: Fusion.Type, removedNodes: inout [Node]) {  // folds the chain below self into self
+        let pattern = fusion.fusionNode()
+        let renames = fusion.change()
+        let formerOutputs = outputs
+        outputs.removeAll()  // refilled by the recursive fold once it reaches the pattern's tail
+        opDesc?.outputs.removeAll()
+        for (i, child) in formerOutputs.enumerated() {
+            child.folderWith(beginNode: self, matchNode: pattern.outputs[i], change: renames, removedNodes: &removedNodes)
+        }
+        opDesc?.type = fusion.fusionType()  // retag both the description and the node
+        type = fusion.fusionType()
     }
     
-  }
-  
-  private func to(depth: UInt, withNode: Node) {
-    if depth < 1 {
-      return
+    private func folderWith(beginNode: Node, matchNode: Node, change: [String : [(from: String, to: String)]], removedNodes: inout [Node]) {
+        guard let inOpdesc = opDesc else {  // a node being folded must carry an op description
+            fatalError()
+        }
+        // Folds this node into `beginNode` (the fusion head). First merge attrs; same-named keys are overwritten.
+        for attr in inOpdesc.attrs {
+            beginNode.opDesc?.attrs[attr.key] = attr.value
+            //            print(beginNode.opDesc?.attrs)
+        }
+        // Copy parameter inputs to the head, applying the `from` -> `to` key renames listed for this node's type.
+        for paraInput in inOpdesc.paraInputs {
+            if let inChanges = change[type] {
+                for keyChange in inChanges {
+                    if keyChange.from == paraInput.key {
+                        beginNode.opDesc?.paraInputs[keyChange.to] = paraInput.value
+                    } else {  // NOTE(review): with several entries in inChanges a renamed key is also stored under its old name here — confirm
+                        beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value
+                    }
+                }
+            } else {
+                beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value
+            }
+        }
+        // At the pattern's tail the head takes over this node's successors and its op outputs.
+        if matchNode.outputs.count == 0 {
+            beginNode.outputs.append(contentsOf: outputs)
+            beginNode.opDesc?.outputs = inOpdesc.outputs
+            // Nodes above the tail do not reach this branch, so they contribute no outputs to the head.
+        }
+        removedNodes.append(self)
+        // Recurse along the pattern; assumes `outputs` has at least `matchNode.outputs.count` entries.
+        for i in 0..<matchNode.outputs.count {
+            outputs[i].folderWith(beginNode: beginNode, matchNode: matchNode.outputs[i], change: change, removedNodes: &removedNodes)
+        }
+        // Every visited node, the tail included, has been appended to `removedNodes` by now.
     }
     
-    for output in outputs {
-      let node = Node.init(inType: output.type)
-      node.opDesc = output.opDesc
-      withNode.outputs.append(node)
-      output.to(depth: depth - 1, withNode: node)
-    }
-  }
-  
-  func relationship() -> [String : Node]{
-    var map: [String : Node] = [:]
-    relationship(map: &map)
-    return map
-  }
-  
-  private func relationship(map: inout [String : Node]) {
-    guard let inOpDesc = opDesc else {
-      return
+    /// Copies up to `depth` further levels of this node's output tree beneath `withNode`.
+    /// Copies share the original `opDesc` objects; their `inputs` arrays are left empty.
+    private func to(depth: UInt, withNode: Node) {
+        guard depth >= 1 else { return }
+        let remaining = depth - 1
+        for source in outputs {
+            let copy = Node(inType: source.type)
+            copy.opDesc = source.opDesc
+            withNode.outputs.append(copy)
+            source.to(depth: remaining, withNode: copy)
+        }
     }
     
-    for output in inOpDesc.outputs {
-      for outputKey in output.value {
-        map[outputKey] = self
-      }
+    func relationship() -> [String : Node] {
+        var producers: [String : Node] = [:]  // output variable name -> node whose opDesc lists it
+        relationship(map: &producers)
+        return producers
     }
     
-    for output in outputs {
-      output.relationship(map: &map)
+    /// Records this node as the producer of every variable name in its op's outputs, then
+    /// recurses into `outputs`. A node without an `opDesc` ends the walk for its whole subtree.
+    /// Later visits overwrite earlier entries for the same name.
+    private func relationship(map: inout [String : Node]) {
+        guard let desc = opDesc else { return }
+        // Dictionary order is unspecified, but every name maps to `self`, so order is irrelevant.
+        for names in desc.outputs.values {
+            for name in names {
+                map[name] = self
+            }
+        }
+        for child in outputs {
+            child.relationship(map: &map)
+        }
     }
-  }
-  
+    
 }
 
 extension Node: Equatable {
-  static func == (lhs: Node, rhs: Node) -> Bool {
-    if lhs.outputs.count != rhs.outputs.count {
-      return false
-    }
-    
-    if lhs.type != rhs.type {
-      return false
+    /// Structural equality: two nodes are equal when their `type` strings match and their
+    /// `outputs` are pairwise equal, in order, under this same rule. `inputs` and `opDesc`
+    /// are not compared, and object identity plays no part.
+    static func == (lhs: Node, rhs: Node) -> Bool {
+        guard lhs.outputs.count == rhs.outputs.count else {
+            return false
+        }
+        guard lhs.type == rhs.type else {
+            return false
+        }
+        // Counts are equal here, so zip pairs every child; `!=` recurses through this operator.
+        for (left, right) in zip(lhs.outputs, rhs.outputs) where left != right {
+            return false
+        }
+        return true
     }
     
-    for i in 0..<lhs.outputs.count {
-      if lhs.outputs[i] != rhs.outputs[i] {
-        return false
-      }
-    }
-    return true
-  }
-  
 }
 
 class ProgramOptimize<P: PrecisionType> {
-  // register fusion
-  let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp<P>.self,
-//                                  ConvAddAddPreluOp<P>.self,
-                                  ConvAddPreluOp<P>.self,
-                                  ConvAddOp<P>.self,
-                                  ConvBNReluOp<P>.self,
-                                  DwConvBNReluOp<P>.self,
-                                  ElementwiseAddPreluOp<P>.self
-  ]
-  
-  func optimize(originProgramDesc: PMProgramDesc) -> PMProgramDesc {
+    /// Registered fusions; `optimize` tries them in this order.
+    let fusionOps: [Fusion.Type] = [
+        ConvAddBatchNormReluOp<P>.self,  // ConvAddAddPreluOp<P>.self would follow here; it is commented out
+        ConvAddPreluOp<P>.self,
+        ConvAddOp<P>.self,
+        ConvBNReluOp<P>.self,
+        DwConvBNReluOp<P>.self,
+        ElementwiseAddPreluOp<P>.self
+    ]
     
-    guard originProgramDesc.blocks.count == 1 else {
-      fatalError(" not support yet")
-    }
-    
-    var mapForNodeChain: [String : Node] = [:]
-    var nodes: [Node] = []
-    var typeMapNodes: [String : [(node: Node, output: [String : Node])]] = [:]
-    let block = originProgramDesc.blocks[0]
-    for opDesc in block.ops {
-        print(opDesc.type)
-      guard let opInputKeys = opInfos[opDesc.type]?.inputs, let outputKeys = opInfos[opDesc.type]?.outputs else {
-        fatalError()
-      }
-      
-      let node = Node.init(inOpDesc: opDesc)
-      for inputKey in opInputKeys {
-        if let inputs = opDesc.inputs[inputKey] {
-          for input in inputs {
-            if let inputNode = mapForNodeChain[input] {
-              _ = inputNode --> node
-            }
-          }
+    func optimize(originProgramDesc: PMProgramDesc) -> PMProgramDesc {
+        // Builds a node graph from the program's single block, folds every matching fusion, returns a new program.
+        guard originProgramDesc.blocks.count == 1 else {
+            fatalError(" not support yet")
         }
-      }
-      
-      for outputKey in outputKeys {
-        if let outputs = opDesc.outputs[outputKey] {
-          for output in outputs {
-            mapForNodeChain[output] = node
-          }
-        }
-      }
-      
-      nodes.append(node)
-      
-      if var inNodes = typeMapNodes[opDesc.type] {
-        inNodes.append((node, mapForNodeChain))
-        typeMapNodes[opDesc.type] = inNodes
-      } else {
-        typeMapNodes[opDesc.type] = [(node, mapForNodeChain)]
-      }
-    }
-    
-    for fusion in fusionOps {
-      let fusionNode = fusion.fusionNode()
-      let depth = fusionNode.depth()
-      if let toMatchNodes = typeMapNodes[fusionNode.type] {
-        for node in toMatchNodes {
-          
-          let toNode = node.node.to(depth: depth)
-          if toNode == fusionNode {   // match
-            var canFolder = true
-            let relationshipMap = toNode.relationship()
+        // mapForNodeChain: variable name -> node that most recently listed it as an output.
+        var mapForNodeChain: [String : Node] = [:]
+        var nodes: [Node] = []
+        var typeMapNodes: [String : [(node: Node, output: [String : Node])]] = [:]
+        let block = originProgramDesc.blocks[0]
+        for opDesc in block.ops {
+            print(opDesc.type)  // NOTE(review): looks like leftover debug output — confirm
+            guard let opInputKeys = opInfos[opDesc.type]?.inputs, let outputKeys = opInfos[opDesc.type]?.outputs else {
+                fatalError()  // reached when opInfos yields no input/output keys for this op type
+            }
             
-            for toCheck in fusion.needCheck() {
-              //              let nodes = toCheck
-              let checkNodes = toNode[toCheck.0]
-              
-              for checkNode in checkNodes {
-                let inputToChecks = checkNode.opDesc?.inputs[toCheck.1] ?? []
-                for inputToCheck in inputToChecks {
-                  if node.output[inputToCheck] == nil {
-                    if relationshipMap[inputToCheck] == nil {
-                      canFolder = false
+            let node = Node.init(inOpDesc: opDesc)
+            for inputKey in opInputKeys {
+                if let inputs = opDesc.inputs[inputKey] {
+                    for input in inputs {
+                        if let inputNode = mapForNodeChain[input] {
+                            _ = inputNode --> node
+                        }
                     }
-                  }
                 }
-                
-                let paramInputToChecks = checkNode.opDesc?.paraInputs[toCheck.1] ?? []
-                for paramInputToCheck in paramInputToChecks {
-                  if node.output[paramInputToCheck] == nil {
-                    if relationshipMap[paramInputToCheck] == nil {
-                      canFolder = false
+            }
+            // Register this op as the latest producer of each of its output variables.
+            for outputKey in outputKeys {
+                if let outputs = opDesc.outputs[outputKey] {
+                    for output in outputs {
+                        mapForNodeChain[output] = node
                     }
-                  }
                 }
-              }
             }
             
-            if !canFolder {
-              continue
-            }
+            nodes.append(node)
             
-            var removeNodes: [Node] = []
-            node.node.folderWith(fusion: fusion, removedNodes: &removeNodes)
-            for removeNode in removeNodes {
-              nodes.remove(element: removeNode)
+            if var inNodes = typeMapNodes[opDesc.type] {
+                inNodes.append((node, mapForNodeChain))
+                typeMapNodes[opDesc.type] = inNodes
+            } else {
+                typeMapNodes[opDesc.type] = [(node, mapForNodeChain)]
             }
-          }
         }
-      }
-    }
-    
-    var ops: [PMOpDesc] = []
-    for node in nodes {
-      ops.append(node.opDesc!)
+        // Try each registered fusion against every node whose type equals the pattern head's type.
+        for fusion in fusionOps {
+            let fusionNode = fusion.fusionNode()
+            let depth = fusionNode.depth()
+            if let toMatchNodes = typeMapNodes[fusionNode.type] {
+                for node in toMatchNodes {
+                    // Copy `depth` levels starting at the candidate and compare the copy with the pattern.
+                    let toNode = node.node.to(depth: depth)
+                    if toNode == fusionNode {   // match
+                        var canFolder = true
+                        let relationshipMap = toNode.relationship()
+                        // Each check is (level, input key): the named inputs of the copied nodes at that level are inspected.
+                        for toCheck in fusion.needCheck() {
+                            //              let nodes = toCheck
+                            let checkNodes = toNode[toCheck.0]
+                            // An inspected name must be in the producer snapshot stored with the head, or produced inside the copy.
+                            for checkNode in checkNodes {
+                                let inputToChecks = checkNode.opDesc?.inputs[toCheck.1] ?? []
+                                for inputToCheck in inputToChecks {
+                                    if node.output[inputToCheck] == nil {
+                                        if relationshipMap[inputToCheck] == nil {
+                                            canFolder = false
+                                        }
+                                    }
+                                }
+                                // Same rule for the parameter inputs stored under that key.
+                                let paramInputToChecks = checkNode.opDesc?.paraInputs[toCheck.1] ?? []
+                                for paramInputToCheck in paramInputToChecks {
+                                    if node.output[paramInputToCheck] == nil {
+                                        if relationshipMap[paramInputToCheck] == nil {
+                                            canFolder = false
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                        // A failed check skips this candidate only; the remaining candidates are still tried.
+                        if !canFolder {
+                            continue
+                        }
+                        // Fold the matched chain into the head node and drop the absorbed nodes from `nodes`.
+                        var removeNodes: [Node] = []
+                        node.node.folderWith(fusion: fusion, removedNodes: &removeNodes)
+                        for removeNode in removeNodes {
+                            nodes.remove(element: removeNode)  // NOTE(review): Node's == is structural; if remove(element:) matches with ==, an equal-shaped node could be removed instead — confirm
+                        }
+                    }
+                }
+            }
+        }
+        // Collect the op descriptions of the nodes that remain.
+        var ops: [PMOpDesc] = []
+        for node in nodes {
+            ops.append(node.opDesc!)
+        }
+        // Wrap them, together with the original block's vars, in a fresh single-block program.
+        let newProgramDesc = PMProgramDesc.init()
+        let newBlock = PMBlockDesc.init(inVars: block.vars, inOps: ops)
+        newProgramDesc.blocks.append(newBlock)
+        return newProgramDesc
     }
-    
-    let newProgramDesc = PMProgramDesc.init()
-    let newBlock = PMBlockDesc.init(inVars: block.vars, inOps: ops)
-    newProgramDesc.blocks.append(newBlock)
-    return newProgramDesc
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/Scope.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/Scope.swift
index d73eefd096..478867b08c 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Program/Scope.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Program/Scope.swift
@@ -48,7 +48,7 @@ public class Scope {
         }
         
     }
-
+    
     func clear(){
         vars.removeAll()
     }
-- 
GitLab