diff --git a/metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift b/metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift
index 4152b9be890fe3101e3137f02686fb6359cb108d..9596c1a535c587897d40cae7c73a4d5b6b442a11 100644
--- a/metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift
+++ b/metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift
@@ -10,37 +10,37 @@ import UIKit
 
 @UIApplicationMain
 class AppDelegate: UIResponder, UIApplicationDelegate {
-
-  var window: UIWindow?
-
-
-  func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]?) -> Bool {
-    // Override point for customization after application launch.
-    return true
-  }
-
-  func applicationWillResignActive(_ application: UIApplication) {
-    // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state.
-    // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game.
-  }
-
-  func applicationDidEnterBackground(_ application: UIApplication) {
-    // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later.
-    // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits.
-  }
-
-  func applicationWillEnterForeground(_ application: UIApplication) {
-    // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background.
-  }
-
-  func applicationDidBecomeActive(_ application: UIApplication) {
-    // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface.
-  }
-
-  func applicationWillTerminate(_ application: UIApplication) {
-    // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:.
-  }
-
-
+    
+    var window: UIWindow?
+    
+    
+    func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]?) -> Bool {
+        // Override point for customization after application launch.
+        return true
+    }
+    
+    func applicationWillResignActive(_ application: UIApplication) {
+        // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state.
+        // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game.
+    }
+    
+    func applicationDidEnterBackground(_ application: UIApplication) {
+        // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later.
+        // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits.
+    }
+    
+    func applicationWillEnterForeground(_ application: UIApplication) {
+        // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background.
+    }
+    
+    func applicationDidBecomeActive(_ application: UIApplication) {
+        // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface.
+    }
+    
+    func applicationWillTerminate(_ application: UIApplication) {
+        // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:.
+    }
+    
+    
 }
 
diff --git a/metal/MobileNetDemo/MobileNetDemo/MobileNet.swift b/metal/MobileNetDemo/MobileNetDemo/MobileNet.swift
index f0902855cc0e40505aaa723c3fac7a1f2301d086..7f26427f2babcf999b81a93c60e6322e0f8d1521 100644
--- a/metal/MobileNetDemo/MobileNetDemo/MobileNet.swift
+++ b/metal/MobileNetDemo/MobileNetDemo/MobileNet.swift
@@ -16,51 +16,51 @@ import Foundation
 import paddle_mobile
 
 public class MobileNet: Net{
-  class MobilenetPreProccess: CusomKernel {
-    init(device: MTLDevice) {
-      let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3)
-      super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
+    class MobilenetPreProccess: CusomKernel {
+        init(device: MTLDevice) {
+            let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3)
+            super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
+        }
     }
-  }
-  
-  class PreWords {
-    var contents: [String] = []
-    init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) {
-      if let filePath = inBundle.path(forResource: fileName, ofType: type) {
-        let string = try! String.init(contentsOfFile: filePath)
-        contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{
-          String($0[$0.index($0.startIndex, offsetBy: 10)...])
+    
+    class PreWords {
+        var contents: [String] = []
+        init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) {
+            if let filePath = inBundle.path(forResource: fileName, ofType: type) {
+                let string = try! String.init(contentsOfFile: filePath)
+                contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{
+                    String($0[$0.index($0.startIndex, offsetBy: 10)...])
+                }
+            }else{
+                fatalError("no file call \(fileName)")
+            }
+        }
+        subscript(index: Int) -> String {
+            return contents[index]
         }
-      }else{
-        fatalError("no file call \(fileName)")
-      }
     }
-    subscript(index: Int) -> String {
-      return contents[index]
+    
+    let labels = PreWords.init(fileName: "synset")
+    
+    override public func resultStr(res: [ResultHolder]) -> String {
+        let firstRes = res[0]
+        let resPointer = firstRes.result
+        var s: [String] = []
+        (0..<firstRes.capacity).map { resPointer[$0] }.top(r: 5).enumerated().forEach{
+            s.append(String(format: "%d: %@ (%3.2f%%)", $0 + 1, labels[$1.0], $1.1 * 100))
+        }
+        return s.joined(separator: "\n")
     }
-  }
-  
-  let labels = PreWords.init(fileName: "synset")
-  
-  override public func resultStr(res: [ResultHolder]) -> String {
-    let firstRes = res[0]
-    let resPointer = firstRes.result
-    var s: [String] = []
-    (0..<firstRes.capacity).map { resPointer[$0] }.top(r: 5).enumerated().forEach{
-      s.append(String(format: "%d: %@ (%3.2f%%)", $0 + 1, labels[$1.0], $1.1 * 100))
+    
+    override public init(device: MTLDevice) {
+        super.init(device: device)
+        except = 0
+        modelPath = Bundle.main.path(forResource: "mobilenet_model", ofType: nil) ?! "model null"
+        paramPath = Bundle.main.path(forResource: "mobilenet_params", ofType: nil) ?! "para null"
+        preprocessKernel = MobilenetPreProccess.init(device: device)
+        inputDim = Dim.init(inDim: [1, 224, 224, 3])
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
     }
-    return s.joined(separator: "\n")
-  }
-  
-  override public init(device: MTLDevice) {
-    super.init(device: device)
-    except = 0
-    modelPath = Bundle.main.path(forResource: "mobilenet_model", ofType: nil) ?! "model null"
-    paramPath = Bundle.main.path(forResource: "mobilenet_params", ofType: nil) ?! "para null"
-    preprocessKernel = MobilenetPreProccess.init(device: device)
-    inputDim = Dim.init(inDim: [1, 224, 224, 3])
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-  }
 }
 
diff --git a/metal/MobileNetDemo/MobileNetDemo/MobilenetPreProcess.metal b/metal/MobileNetDemo/MobileNetDemo/MobilenetPreProcess.metal
index c7db4187c1e27b37260b87ab251b9881251b67da..2da78ec4c18d7f84f9bd5512c9e13e3b56a1c9c1 100644
--- a/metal/MobileNetDemo/MobileNetDemo/MobilenetPreProcess.metal
+++ b/metal/MobileNetDemo/MobileNetDemo/MobilenetPreProcess.metal
@@ -14,13 +14,13 @@ kernel void mobilenet_preprocess(
                                  texture2d<float, access::write> outTexture [[texture(1)]],
                                  uint2 gid [[thread_position_in_grid]])
 {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) {
-    return;
-  }
-  const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f);
-  const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
-  outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f);
+    const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
+    outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
 }
 
 kernel void mobilenet_preprocess_half(
@@ -28,11 +28,11 @@ kernel void mobilenet_preprocess_half(
                                       texture2d<half, access::write> outTexture [[texture(1)]],
                                       uint2 gid [[thread_position_in_grid]])
 {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) {
-    return;
-  }
-  const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f);
-  const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
-  outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f);
+    const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
+    outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
 }
diff --git a/metal/MobileNetDemo/MobileNetDemo/ViewController.swift b/metal/MobileNetDemo/MobileNetDemo/ViewController.swift
index 4e31282f0356bdd3ce4bc2b7ef69e7ad0bd5ef89..a0d69c5c0633b68adf82582e1cef6357137645a5 100644
--- a/metal/MobileNetDemo/MobileNetDemo/ViewController.swift
+++ b/metal/MobileNetDemo/MobileNetDemo/ViewController.swift
@@ -10,84 +10,84 @@ import UIKit
 import paddle_mobile
 
 class ViewController: UIViewController {
-  @IBOutlet weak var resultTextView: UITextView!
-  @IBOutlet weak var selectImageView: UIImageView!
-  @IBOutlet weak var elapsedTimeLabel: UILabel!
-  var net: MobileNet!
-  var runner: Runner!
-  var toPredictTexture: MTLTexture?
-  
-  override func viewDidLoad() {
-    super.viewDidLoad()
-    GlobalConfig.shared.computePrecision = .Float16
-    net = MobileNet.init(device: MetalHelper.shared.device)
-    runner = Runner.init(inNet: net, commandQueue: MetalHelper.shared.queue)
+    @IBOutlet weak var resultTextView: UITextView!
+    @IBOutlet weak var selectImageView: UIImageView!
+    @IBOutlet weak var elapsedTimeLabel: UILabel!
+    var net: MobileNet!
+    var runner: Runner!
+    var toPredictTexture: MTLTexture?
     
-    if let selectImage = UIImage.init(named: "banana.jpeg") {
-      selectImageView.image = selectImage
-      runner.getTexture(image: selectImage.cgImage!) {[weak self] (texture) in
-        self?.toPredictTexture = texture
-      }
+    override func viewDidLoad() {
+        super.viewDidLoad()
+        GlobalConfig.shared.computePrecision = .Float16
+        net = MobileNet.init(device: MetalHelper.shared.device)
+        runner = Runner.init(inNet: net, commandQueue: MetalHelper.shared.queue)
+        
+        if let selectImage = UIImage.init(named: "banana.jpeg") {
+            selectImageView.image = selectImage
+            runner.getTexture(image: selectImage.cgImage!) {[weak self] (texture) in
+                self?.toPredictTexture = texture
+            }
+        }
+        
+    }
+    
+    @IBAction func loadAct(_ sender: Any) {
+        if runner.load() {
+            let resutText = " load success ! "
+            print(resutText)
+            self.resultTextView.text = resutText
+        } else {
+            fatalError(" load error ")
+        }
+    }
+    
+    @IBAction func selectImageAct(_ sender: Any) {
+        let imagePicker = UIImagePickerController()
+        imagePicker.sourceType = .camera
+        imagePicker.delegate = self
+        self.present(imagePicker, animated: true, completion: nil)
     }
     
-  }
-  
-  @IBAction func loadAct(_ sender: Any) {
-    if runner.load() {
-      let resutText = " load success ! "
-      print(resutText)
-      self.resultTextView.text = resutText
-    } else {
-      fatalError(" load error ")
+    @IBAction func clearAct(_ sender: Any) {
+        runner.clear()
     }
-  }
-  
-  @IBAction func selectImageAct(_ sender: Any) {
-    let imagePicker = UIImagePickerController()
-    imagePicker.sourceType = .camera
-    imagePicker.delegate = self
-    self.present(imagePicker, animated: true, completion: nil)
-  }
-  
-  @IBAction func clearAct(_ sender: Any) {
-    runner.clear()
-  }
-  
-  @IBAction func predictAct(_ sender: Any) {
     
-    if let texture = toPredictTexture {
-      let beginDate = Date.init()
-      runner.predict(texture: texture) { [weak self] (success, resultHolder) in
-        if success, let inResultHolder = resultHolder {
-          let timeUse = Date.init().timeIntervalSince(beginDate)
-          DispatchQueue.main.async {
-            self?.elapsedTimeLabel.text = "\(timeUse * 1000)ms"
-            self?.resultTextView.text = self?.net.resultStr(res: inResultHolder)
-          }
-          
+    @IBAction func predictAct(_ sender: Any) {
+        
+        if let texture = toPredictTexture {
+            let beginDate = Date.init()
+            runner.predict(texture: texture) { [weak self] (success, resultHolder) in
+                if success, let inResultHolder = resultHolder {
+                    let timeUse = Date.init().timeIntervalSince(beginDate)
+                    DispatchQueue.main.async {
+                        self?.elapsedTimeLabel.text = "\(timeUse * 1000)ms"
+                        self?.resultTextView.text = self?.net.resultStr(res: inResultHolder)
+                    }
+                    
+                } else {
+                    print(" predict fail ")
+                }
+            }
         } else {
-          print(" predict fail ")
+            print(" toPredictTexture is nil ")
         }
-      }
-    } else {
-      print(" toPredictTexture is nil ")
+        
     }
     
-  }
-  
 }
 
 extension ViewController:  UIImagePickerControllerDelegate, UINavigationControllerDelegate {
-  func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) {
-    picker.dismiss(animated: true){[weak self] in
-      guard let sSelf = self, let image =  info["UIImagePickerControllerOriginalImage"] as? UIImage else {
-        fatalError("no image")
-      }
-      sSelf.selectImageView.image = image
-      sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in
-        sSelf.toPredictTexture = texture
-      })
+    func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) {
+        picker.dismiss(animated: true){[weak self] in
+            guard let sSelf = self, let image =  info["UIImagePickerControllerOriginalImage"] as? UIImage else {
+                fatalError("no image")
+            }
+            sSelf.selectImageView.image = image
+            sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in
+                sSelf.toPredictTexture = texture
+            })
+        }
     }
-  }
 }
 
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift
index 537fb06ed9e5b9100bea43b7acae9c014e0f4a78..557f5eef35c47e0a341223acfd8ec3ef8d77de31 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift
@@ -16,36 +16,36 @@ import UIKit
 
 @UIApplicationMain
 class AppDelegate: UIResponder, UIApplicationDelegate {
-
+    
     var window: UIWindow?
-
+    
     func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplicationLaunchOptionsKey: Any]?) -> Bool {
         // Override point for customization after application launch.
         return true
     }
-
+    
     func applicationWillResignActive(_ application: UIApplication) {
         // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state.
         // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game.
     }
-
+    
     func applicationDidEnterBackground(_ application: UIApplication) {
         // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later.
         // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits.
     }
-
+    
     func applicationWillEnterForeground(_ application: UIApplication) {
         // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background.
     }
-
+    
     func applicationDidBecomeActive(_ application: UIApplication) {
         // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface.
     }
-
+    
     func applicationWillTerminate(_ application: UIApplication) {
         // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:.
     }
-
-
+    
+    
 }
 
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
index 88445bfdb42afadaf94cbb562bc83cae57954a39..d67403f27299c887922be6ced1fcb6a28bf89fbb 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="14460.31" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="4MS-jc-i6A">
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="14460.31" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
     <device id="retina4_7" orientation="portrait">
         <adaptation id="fullscreen"/>
     </device>
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift
index ca19c166c34179fea5a9af3f30c8e3185daee03f..8252258c978a2b74a298389bec240eb256fc9126 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift
@@ -18,14 +18,14 @@ import Foundation
 import paddle_mobile
 
 @objc public class MetalHelper: NSObject {
-  @objc let device: MTLDevice
-  @objc let queue: MTLCommandQueue
-  @objc let textureLoader: MTKTextureLoader
-  @objc static let shared: MetalHelper = MetalHelper.init()
-  private override init(){
-    device = MTLCreateSystemDefaultDevice()!
-    queue = device.makeCommandQueue()!
-    textureLoader = MTKTextureLoader.init(device: device)
-    super.init()
-  }
+    @objc let device: MTLDevice
+    @objc let queue: MTLCommandQueue
+    @objc let textureLoader: MTKTextureLoader
+    @objc static let shared: MetalHelper = MetalHelper.init()
+    private override init(){
+        device = MTLCreateSystemDefaultDevice()!
+        queue = device.makeCommandQueue()!
+        textureLoader = MTKTextureLoader.init(device: device)
+        super.init()
+    }
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift
index 22fb5723ac9e6f358f2632467389f277603fc59d..8af436d7796e445dc60d138927d07d7187db6bf6 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift
@@ -16,51 +16,51 @@ import UIKit
 import paddle_mobile
 
 class MultiPredictViewController: UIViewController {
-  var runner1: Runner!
-  var runner2: Runner!
-  override func viewDidLoad() {
-    super.viewDidLoad()
-    let mobileNet = MobileNet_ssd_hand.init(device: MetalHelper.shared.device)
-    let genet = Genet.init(device: MetalHelper.shared.device)
-    runner1 = Runner.init(inNet: mobileNet, commandQueue: MetalHelper.shared.queue)
-    let queue2 = MetalHelper.shared.device.makeCommandQueue()
+    var runner1: Runner!
+    var runner2: Runner!
+    override func viewDidLoad() {
+        super.viewDidLoad()
+        let mobileNet = MobileNet_ssd_hand.init(device: MetalHelper.shared.device)
+        let genet = Genet.init(device: MetalHelper.shared.device)
+        runner1 = Runner.init(inNet: mobileNet, commandQueue: MetalHelper.shared.queue)
+        let queue2 = MetalHelper.shared.device.makeCommandQueue()
+        
+        runner2 = Runner.init(inNet: genet, commandQueue: MetalHelper.shared.queue)
+    }
     
-    runner2 = Runner.init(inNet: genet, commandQueue: MetalHelper.shared.queue)
-  }
-
-  @IBAction func predictAct(_ sender: Any) {
-    let success = self.runner2.load()
-//    DispatchQueue.global().async {
-      let image1 = UIImage.init(named: "hand.jpg")
-//      let success = self.runner2.load()
-//      if success {
-//        for i in 0..<10000 {
-//          print(i)
-//          self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in
-//            print("result1: ")
-////            print(res)
-//          })
-//        }
-//      } else {
-//        print("load failed")
-//      }
-//      self.runner1.clear()
-//    }
-//    return
-//    DispatchQueue.global().async {
-////      sleep(1)
-//      let image1 = UIImage.init(named: "banana.jpeg")
-////      if success {
-//        for _ in 0..<10 {
-//          self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in
-//            print("result2: ")
-//            print(res)
-//          })
-//        }
-////      } else {
-////        print("load failed")
-////      }
-////      self.runner2.clear()
-//    }
-  }
+    @IBAction func predictAct(_ sender: Any) {
+        let success = self.runner2.load()
+        //    DispatchQueue.global().async {
+        let image1 = UIImage.init(named: "hand.jpg")
+        //      let success = self.runner2.load()
+        //      if success {
+        //        for i in 0..<10000 {
+        //          print(i)
+        //          self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in
+        //            print("result1: ")
+        ////            print(res)
+        //          })
+        //        }
+        //      } else {
+        //        print("load failed")
+        //      }
+        //      self.runner1.clear()
+        //    }
+        //    return
+        //    DispatchQueue.global().async {
+        ////      sleep(1)
+        //      let image1 = UIImage.init(named: "banana.jpeg")
+        ////      if success {
+        //        for _ in 0..<10 {
+        //          self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in
+        //            print("result2: ")
+        //            print(res)
+        //          })
+        //        }
+        ////      } else {
+        ////        print("load failed")
+        ////      }
+        ////      self.runner2.clear()
+        //    }
+    }
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.mm b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.mm
index fac8af25278e8aa2350669fb4b921049a512e241..ddfc5f770d578dde5f345bcb5776bb1504078456 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.mm
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.mm
@@ -20,30 +20,30 @@
 #import <algorithm>
 
 struct NMSParam {
-  
-  float *score_data;
-  
-  float *box_data;
-  
-  float *output;
-  
-  int output_size;
-  
-  std::vector<int> score_dim;
-  
-  std::vector<int> box_dim;
-  
-  float scoreThredshold;
-  
-  int nmsTopK;
-  
-  int keepTopK;
-  
-  float nmsEta;
-  
-  float nmsThreshold;
-  
-  int background_label;
+    
+    float *score_data;
+    
+    float *box_data;
+    
+    float *output;
+    
+    int output_size;
+    
+    std::vector<int> score_dim;
+    
+    std::vector<int> box_dim;
+    
+    float scoreThredshold;
+    
+    int nmsTopK;
+    
+    int keepTopK;
+    
+    float nmsEta;
+    
+    float nmsThreshold;
+    
+    int background_label;
 };
 
 
@@ -53,63 +53,63 @@ constexpr int kBBoxSize = 4;
 template <class T>
 bool SortScorePairDescend(const std::pair<float, T>& pair1,
                           const std::pair<float, T>& pair2) {
-  return pair1.first > pair2.first;
+    return pair1.first > pair2.first;
 }
 
 template <class T>
 static inline void GetMaxScoreIndex(
                                     const std::vector<T>& scores, const T threshold, int top_k,
                                     std::vector<std::pair<T, int>>* sorted_indices) {
-  for (size_t i = 0; i < scores.size(); ++i) {
-    if (scores[i] > threshold) {
-      sorted_indices->push_back(std::make_pair(scores[i], i));
+    for (size_t i = 0; i < scores.size(); ++i) {
+        if (scores[i] > threshold) {
+            sorted_indices->push_back(std::make_pair(scores[i], i));
+        }
+    }
+    // Sort the score pair according to the scores in descending order
+    std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
+                     SortScorePairDescend<int>);
+    // Keep top_k scores if needed.
+    if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
+        sorted_indices->resize(top_k);
     }
-  }
-  // Sort the score pair according to the scores in descending order
-  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
-                   SortScorePairDescend<int>);
-  // Keep top_k scores if needed.
-  if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
-    sorted_indices->resize(top_k);
-  }
 }
 
 template <class T>
 static inline T BBoxArea(const T* box, const bool normalized) {
-  if (box[2] < box[0] || box[3] < box[1]) {
-    // If coordinate values are is invalid
-    // (e.g. xmax < xmin or ymax < ymin), return 0.
-    return static_cast<T>(0.);
-  } else {
-    const T w = box[2] - box[0];
-    const T h = box[3] - box[1];
-    if (normalized) {
-      return w * h;
+    if (box[2] < box[0] || box[3] < box[1]) {
+        // If coordinate values are invalid
+        // (e.g. xmax < xmin or ymax < ymin), return 0.
+        return static_cast<T>(0.);
     } else {
-      // If coordinate values are not within range [0, 1].
-      return (w + 1) * (h + 1);
+        const T w = box[2] - box[0];
+        const T h = box[3] - box[1];
+        if (normalized) {
+            return w * h;
+        } else {
+            // If coordinate values are not within range [0, 1].
+            return (w + 1) * (h + 1);
+        }
     }
-  }
 }
 
 template <class T>
 static inline T JaccardOverlap(const T* box1, const T* box2,
                                const bool normalized) {
-  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
-      box2[3] < box1[1]) {
-    return static_cast<T>(0.);
-  } else {
-    const T inter_xmin = std::max(box1[0], box2[0]);
-    const T inter_ymin = std::max(box1[1], box2[1]);
-    const T inter_xmax = std::min(box1[2], box2[2]);
-    const T inter_ymax = std::min(box1[3], box2[3]);
-    const T inter_w = inter_xmax - inter_xmin;
-    const T inter_h = inter_ymax - inter_ymin;
-    const T inter_area = inter_w * inter_h;
-    const T bbox1_area = BBoxArea<T>(box1, normalized);
-    const T bbox2_area = BBoxArea<T>(box2, normalized);
-    return inter_area / (bbox1_area + bbox2_area - inter_area);
-  }
+    if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
+        box2[3] < box1[1]) {
+        return static_cast<T>(0.);
+    } else {
+        const T inter_xmin = std::max(box1[0], box2[0]);
+        const T inter_ymin = std::max(box1[1], box2[1]);
+        const T inter_xmax = std::min(box1[2], box2[2]);
+        const T inter_ymax = std::min(box1[3], box2[3]);
+        const T inter_w = inter_xmax - inter_xmin;
+        const T inter_h = inter_ymax - inter_ymin;
+        const T inter_area = inter_w * inter_h;
+        const T bbox1_area = BBoxArea<T>(box1, normalized);
+        const T bbox2_area = BBoxArea<T>(box2, normalized);
+        return inter_area / (bbox1_area + bbox2_area - inter_area);
+    }
 }
 
 template <typename T>
@@ -120,40 +120,40 @@ static inline void NMSFast(
                            const T score_threshold, const T nms_threshold,
                            const T eta, const int top_k,
                            std::vector<int>* selected_indices) {
-  // The total boxes for each instance.
-  int num_boxes = bbox_dim[0];
-  // 4: [xmin ymin xmax ymax]
-  int box_size = bbox_dim[1];
-
-  std::vector<T> scores_data(num_boxes);
-  std::copy_n(score_data, num_boxes, scores_data.begin());
-  std::vector<std::pair<T, int>> sorted_indices;
-  GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);
-
-  selected_indices->clear();
-  T adaptive_threshold = nms_threshold;
-
-  while (sorted_indices.size() != 0) {
-    const int idx = sorted_indices.front().second;
-    bool keep = true;
-    for (size_t k = 0; k < selected_indices->size(); ++k) {
-      if (keep) {
-        const int kept_idx = (*selected_indices)[k];
-        T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
-                                      bbox_data + kept_idx * box_size, true);
-        keep = overlap <= adaptive_threshold;
-      } else {
-        break;
-      }
-    }
-    if (keep) {
-      selected_indices->push_back(idx);
-    }
-    sorted_indices.erase(sorted_indices.begin());
-    if (keep && eta < 1 && adaptive_threshold > 0.5) {
-      adaptive_threshold *= eta;
+    // The total boxes for each instance.
+    int num_boxes = bbox_dim[0];
+    // 4: [xmin ymin xmax ymax]
+    int box_size = bbox_dim[1];
+    
+    std::vector<T> scores_data(num_boxes);
+    std::copy_n(score_data, num_boxes, scores_data.begin());
+    std::vector<std::pair<T, int>> sorted_indices;
+    GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);
+    
+    selected_indices->clear();
+    T adaptive_threshold = nms_threshold;
+    
+    while (sorted_indices.size() != 0) {
+        const int idx = sorted_indices.front().second;
+        bool keep = true;
+        for (size_t k = 0; k < selected_indices->size(); ++k) {
+            if (keep) {
+                const int kept_idx = (*selected_indices)[k];
+                T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
+                                              bbox_data + kept_idx * box_size, true);
+                keep = overlap <= adaptive_threshold;
+            } else {
+                break;
+            }
+        }
+        if (keep) {
+            selected_indices->push_back(idx);
+        }
+        sorted_indices.erase(sorted_indices.begin());
+        if (keep && eta < 1 && adaptive_threshold > 0.5) {
+            adaptive_threshold *= eta;
+        }
     }
-  }
 }
 
 template <typename T>
@@ -165,48 +165,48 @@ void MultiClassNMS(const T *boxes_data,
                    const int& background_label, const int& nms_top_k,
                    const int& keep_top_k, const T& nms_threshold,
                    const T& nms_eta, const T& score_threshold) {
-  
-  int64_t class_num = score_dim[0];
-  int64_t predict_dim = score_dim[1];
-  int num_det = 0;
-  for (int c = 0; c < class_num; ++c) {
-    if (c == background_label) continue;
-    const T *score_data = scores_data + c * predict_dim;
     
-    /// [c] is key
-    NMSFast<T>(boxes_data, box_dim, score_data, score_threshold, nms_threshold, nms_eta,
+    int64_t class_num = score_dim[0];
+    int64_t predict_dim = score_dim[1];
+    int num_det = 0;
+    for (int c = 0; c < class_num; ++c) {
+        if (c == background_label) continue;
+        const T *score_data = scores_data + c * predict_dim;
+        
+        /// [c] is key
+        NMSFast<T>(boxes_data, box_dim, score_data, score_threshold, nms_threshold, nms_eta,
                    nms_top_k, &((*indices)[c]));
-    num_det += (*indices)[c].size();
-  }
-
-  *num_nmsed_out = num_det;
-  if (keep_top_k > -1 && num_det > keep_top_k) {
-    std::vector<std::pair<T, std::pair<int, int>>> score_index_pairs;
-    for (const auto& it : *indices) {
-      int label = it.first;
-      const T* sdata = scores_data + label * predict_dim;
-      const std::vector<int>& label_indices = it.second;
-      for (size_t j = 0; j < label_indices.size(); ++j) {
-        int idx = label_indices[j];
-        // PADDLE_ENFORCE_LT(idx, predict_dim);
-        score_index_pairs.push_back(std::make_pair(sdata[idx], std::make_pair(label, idx)));
-      }
+        num_det += (*indices)[c].size();
     }
-    // Keep top k results per image.
-    std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
-                     SortScorePairDescend<std::pair<int, int>>);
-    score_index_pairs.resize(keep_top_k);
-
-    // Store the new indices.
-    std::map<int, std::vector<int>> new_indices;
-    for (size_t j = 0; j < score_index_pairs.size(); ++j) {
-      int label = score_index_pairs[j].second.first;
-      int idx = score_index_pairs[j].second.second;
-      new_indices[label].push_back(idx);
+    
+    *num_nmsed_out = num_det;
+    if (keep_top_k > -1 && num_det > keep_top_k) {
+        std::vector<std::pair<T, std::pair<int, int>>> score_index_pairs;
+        for (const auto& it : *indices) {
+            int label = it.first;
+            const T* sdata = scores_data + label * predict_dim;
+            const std::vector<int>& label_indices = it.second;
+            for (size_t j = 0; j < label_indices.size(); ++j) {
+                int idx = label_indices[j];
+                // PADDLE_ENFORCE_LT(idx, predict_dim);
+                score_index_pairs.push_back(std::make_pair(sdata[idx], std::make_pair(label, idx)));
+            }
+        }
+        // Keep top k results per image.
+        std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
+                         SortScorePairDescend<std::pair<int, int>>);
+        score_index_pairs.resize(keep_top_k);
+        
+        // Store the new indices.
+        std::map<int, std::vector<int>> new_indices;
+        for (size_t j = 0; j < score_index_pairs.size(); ++j) {
+            int label = score_index_pairs[j].second.first;
+            int idx = score_index_pairs[j].second.second;
+            new_indices[label].push_back(idx);
+        }
+        new_indices.swap(*indices);
+        *num_nmsed_out = keep_top_k;
     }
-    new_indices.swap(*indices);
-    *num_nmsed_out = keep_top_k;
-  }
 }
 
 template <typename T>
@@ -215,69 +215,69 @@ void MultiClassOutput(const T *scores_data,
                       const T *bboxes_data,
                       T *outputs_data,
                       const std::map<int, std::vector<int>>& selected_indices) {
-  int predict_dim = score_dim[1];
-  int count = 0;
-  for (const auto& it : selected_indices) {
-    /// one batch
-    int label = it.first;
-    const T* sdata = scores_data + label * predict_dim;
-    const std::vector<int>& indices = it.second;
-    for (size_t j = 0; j < indices.size(); ++j) {
-      int idx = indices[j];
-      const T* bdata = bboxes_data + idx * kBBoxSize;
-      outputs_data[count * kOutputDim] = label;           // label
-      outputs_data[count * kOutputDim + 1] = sdata[idx];  // score
-      // xmin, ymin, xmax, ymax
-      std::memcpy(outputs_data + count * kOutputDim + 2, bdata, 4 * sizeof(T));
-      count++;
+    int predict_dim = score_dim[1];
+    int count = 0;
+    for (const auto& it : selected_indices) {
+        /// one batch
+        int label = it.first;
+        const T* sdata = scores_data + label * predict_dim;
+        const std::vector<int>& indices = it.second;
+        for (size_t j = 0; j < indices.size(); ++j) {
+            int idx = indices[j];
+            const T* bdata = bboxes_data + idx * kBBoxSize;
+            outputs_data[count * kOutputDim] = label;           // label
+            outputs_data[count * kOutputDim + 1] = sdata[idx];  // score
+            // xmin, ymin, xmax, ymax
+            std::memcpy(outputs_data + count * kOutputDim + 2, bdata, 4 * sizeof(T));
+            count++;
+        }
     }
-  }
 }
 
 void MultiClassNMSCompute(NMSParam *param) {
-  assert(param->score_dim[0] == 1);
-  assert(param->box_dim[0] == 1);
-  assert (param->score_dim.size() == 3);
-  assert(param->box_dim.size() == 3);
-
-  float* outputs;
-  auto background_label = param->background_label;
-  auto nms_top_k = param->nmsTopK;
-  auto keep_top_k = param->keepTopK;
-  auto nms_threshold = param->nmsThreshold;
-  auto nms_eta = param->nmsEta;
-  auto score_threshold = param->scoreThredshold;
-
-  std::vector<int> score_dim_one_batch = {param->score_dim[1], param->score_dim[2]};
-  std::vector<int> box_dim_one_batch = {param->box_dim[1], param->box_dim[2]};
-  
-  std::vector<int> batch_starts = {0};
-  
-  std::map<int, std::vector<int>> indices;
-  int num_nmsed_out = 0;
-  
-  MultiClassNMS<float>(param->box_data, box_dim_one_batch, param->score_data, score_dim_one_batch, &indices, &num_nmsed_out,
-                       background_label, nms_top_k, keep_top_k, nms_threshold,
-                       nms_eta, score_threshold);
-  batch_starts.push_back(batch_starts.back() + num_nmsed_out);
-
-  int output_size = 0;
-  int num_kept = batch_starts.back();
-  if (num_kept == 0) {
-    outputs = new float[1];
-    outputs[0] = -1;
-    output_size = 1;
-  } else {
-    outputs = new float[num_kept * kOutputDim];
-    int64_t s = batch_starts[0];
-    int64_t e = batch_starts[1];
-    if (e > s) {
-      MultiClassOutput<float>(param->score_data, score_dim_one_batch, param->box_data, outputs, indices);
+    assert(param->score_dim[0] == 1);
+    assert(param->box_dim[0] == 1);
+    assert (param->score_dim.size() == 3);
+    assert(param->box_dim.size() == 3);
+    
+    float* outputs;
+    auto background_label = param->background_label;
+    auto nms_top_k = param->nmsTopK;
+    auto keep_top_k = param->keepTopK;
+    auto nms_threshold = param->nmsThreshold;
+    auto nms_eta = param->nmsEta;
+    auto score_threshold = param->scoreThredshold;
+    
+    std::vector<int> score_dim_one_batch = {param->score_dim[1], param->score_dim[2]};
+    std::vector<int> box_dim_one_batch = {param->box_dim[1], param->box_dim[2]};
+    
+    std::vector<int> batch_starts = {0};
+    
+    std::map<int, std::vector<int>> indices;
+    int num_nmsed_out = 0;
+    
+    MultiClassNMS<float>(param->box_data, box_dim_one_batch, param->score_data, score_dim_one_batch, &indices, &num_nmsed_out,
+                         background_label, nms_top_k, keep_top_k, nms_threshold,
+                         nms_eta, score_threshold);
+    batch_starts.push_back(batch_starts.back() + num_nmsed_out);
+    
+    int output_size = 0;
+    int num_kept = batch_starts.back();
+    if (num_kept == 0) {
+        outputs = new float[1];
+        outputs[0] = -1;
+        output_size = 1;
+    } else {
+        outputs = new float[num_kept * kOutputDim];
+        int64_t s = batch_starts[0];
+        int64_t e = batch_starts[1];
+        if (e > s) {
+            MultiClassOutput<float>(param->score_data, score_dim_one_batch, param->box_data, outputs, indices);
+        }
+        output_size = num_kept * kOutputDim;
     }
-    output_size = num_kept * kOutputDim;
-  }
-  param->output = outputs;
-  param->output_size = output_size;
+    param->output = outputs;
+    param->output_size = output_size;
 }
 
 @implementation CPUResult
@@ -286,31 +286,31 @@ void MultiClassNMSCompute(NMSParam *param) {
 @implementation NMSCompute
 
 -(CPUResult *)computeWithScore:(float *)score andBBoxs:(float *)bbox {
-  NMSParam param;
-  param.box_data = bbox;
-  param.score_data = score;
-  param.background_label = self.background_label;
-  param.scoreThredshold = self.scoreThredshold;
-  param.nmsTopK = self.nmsTopK;
-  param.keepTopK = self.keepTopK;
-  param.nmsEta = self.nmsEta;
-  param.nmsThreshold = self.nmsThreshold;
-  std::vector<int> score_dim;
-  for (int i = 0; i < self.scoreDim.count; ++i) {
-    score_dim.push_back(self.scoreDim[i].intValue);
-  }
-  param.score_dim = score_dim;
-  
-  std::vector<int> box_dim;
-  for (int i = 0; i < self.bboxDim.count; ++i) {
-    box_dim.push_back(self.bboxDim[i].intValue);
-  }
-  param.box_dim = box_dim;
-  MultiClassNMSCompute(&param);
-  CPUResult *cr = [[CPUResult alloc] init];
-  cr.output = param.output;
-  cr.outputSize = param.output_size;
-  return cr;
+    NMSParam param;
+    param.box_data = bbox;
+    param.score_data = score;
+    param.background_label = self.background_label;
+    param.scoreThredshold = self.scoreThredshold;
+    param.nmsTopK = self.nmsTopK;
+    param.keepTopK = self.keepTopK;
+    param.nmsEta = self.nmsEta;
+    param.nmsThreshold = self.nmsThreshold;
+    std::vector<int> score_dim;
+    for (int i = 0; i < self.scoreDim.count; ++i) {
+        score_dim.push_back(self.scoreDim[i].intValue);
+    }
+    param.score_dim = score_dim;
+    
+    std::vector<int> box_dim;
+    for (int i = 0; i < self.bboxDim.count; ++i) {
+        box_dim.push_back(self.bboxDim[i].intValue);
+    }
+    param.box_dim = box_dim;
+    MultiClassNMSCompute(&param);
+    CPUResult *cr = [[CPUResult alloc] init];
+    cr.output = param.output;
+    cr.outputSize = param.output_size;
+    return cr;
 }
 
 @end
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Genet.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Genet.swift
index 91bf014e9f762c643a6c482e81de6f6d35c995b0..b248e53bac56ba2018b029406486a29bb52e224f 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Genet.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Genet.swift
@@ -16,37 +16,37 @@ import Foundation
 import paddle_mobile
 
 public class Genet: Net {
-  @objc public override init(device: MTLDevice) {
-    super.init(device: device)
-    modelPath = Bundle.main.path(forResource: "genet_model", ofType: nil) ?! "model null"
-    paramPath = Bundle.main.path(forResource: "genet_params", ofType: nil) ?! "para null"
-    preprocessKernel = GenetPreProccess.init(device: device)
-    inputDim = Dim.init(inDim: [1, 128, 128, 3])
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-  }
-  
-  @objc override public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) {
-    super.init(device: device,
-               inParamPointer: inParamPointer,
-               inParamSize: inParamSize,
-               inModelPointer: inModelPointer,
-               inModelSize: inModelSize)
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-    preprocessKernel = GenetPreProccess.init(device: device)
-    inputDim = Dim.init(inDim: [1, 128, 128, 3])
-  }
-
-  class GenetPreProccess: CusomKernel {
-    init(device: MTLDevice) {
-      let s = Shape.init(inWidth: 128, inHeight: 128, inChannel: 3)
-      super.init(device: device, inFunctionName: "genet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
+    @objc public override init(device: MTLDevice) {
+        super.init(device: device)
+        modelPath = Bundle.main.path(forResource: "genet_model", ofType: nil) ?! "model null"
+        paramPath = Bundle.main.path(forResource: "genet_params", ofType: nil) ?! "para null"
+        preprocessKernel = GenetPreProccess.init(device: device)
+        inputDim = Dim.init(inDim: [1, 128, 128, 3])
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+    }
+    
+    @objc override public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) {
+        super.init(device: device,
+                   inParamPointer: inParamPointer,
+                   inParamSize: inParamSize,
+                   inModelPointer: inModelPointer,
+                   inModelSize: inModelSize)
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+        preprocessKernel = GenetPreProccess.init(device: device)
+        inputDim = Dim.init(inDim: [1, 128, 128, 3])
+    }
+    
+    class GenetPreProccess: CusomKernel {
+        init(device: MTLDevice) {
+            let s = Shape.init(inWidth: 128, inHeight: 128, inChannel: 3)
+            super.init(device: device, inFunctionName: "genet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
+        }
+    }
+    
+    override  public func resultStr(res: [ResultHolder]) -> String {
+        return " \(res[0].result[0]) ... "
     }
-  }
-  
-  override  public func resultStr(res: [ResultHolder]) -> String {
-    return " \(res[0].result[0]) ... "
-  }
-  
+    
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift
index d35fde97d7e0da67aa446ba7f8f7a33db265e402..608cd3180b0dedabafecb72baf98bf289163de20 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift
@@ -16,53 +16,53 @@ import Foundation
 import paddle_mobile
 
 public class MobileNet: Net{
-  
-  class MobilenetPreProccess: CusomKernel {
-    init(device: MTLDevice) {
-      let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3)
-      super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
-    }
-  }
-  
-  class PreWords {
-    var contents: [String] = []
-    init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) {
-      if let filePath = inBundle.path(forResource: fileName, ofType: type) {
-        let string = try! String.init(contentsOfFile: filePath)
-        contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{
-          String($0[$0.index($0.startIndex, offsetBy: 10)...])
+    
+    class MobilenetPreProccess: CusomKernel {
+        init(device: MTLDevice) {
+            let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3)
+            super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
         }
-      }else{
-        fatalError("no file call \(fileName)")
-      }
     }
-    subscript(index: Int) -> String {
-      return contents[index]
+    
+    class PreWords {
+        var contents: [String] = []
+        init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) {
+            if let filePath = inBundle.path(forResource: fileName, ofType: type) {
+                let string = try! String.init(contentsOfFile: filePath)
+                contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{
+                    String($0[$0.index($0.startIndex, offsetBy: 10)...])
+                }
+            }else{
+                fatalError("no file call \(fileName)")
+            }
+        }
+        subscript(index: Int) -> String {
+            return contents[index]
+        }
     }
-  }
-  
-  let labels = PreWords.init(fileName: "synset")
-  
-  override public func resultStr(res: [ResultHolder]) -> String {
-    let resPointer = res[0].result
-    var s: [String] = []
-    (0..<res[0].capacity).map { resPointer[$0] }.top(r: 5).enumerated().forEach{
-      s.append(String(format: "%d: %@ (%3.2f%%)", $0 + 1, labels[$1.0], $1.1 * 100))
+    
+    let labels = PreWords.init(fileName: "synset")
+    
+    override public func resultStr(res: [ResultHolder]) -> String {
+        let resPointer = res[0].result
+        var s: [String] = []
+        (0..<res[0].capacity).map { resPointer[$0] }.top(r: 5).enumerated().forEach{
+            s.append(String(format: "%d: %@ (%3.2f%%)", $0 + 1, labels[$1.0], $1.1 * 100))
+        }
+        return s.joined(separator: "\n")
     }
-    return s.joined(separator: "\n")
-  }
     
-  override public init(device: MTLDevice) {
-    super.init(device: device)
-    except = 0
-    modelPath = Bundle.main.path(forResource: "mobilenet_model", ofType: nil) ?! "model null"
-    paramPath = Bundle.main.path(forResource: "mobilenet_params", ofType: nil) ?! "para null"    
-//    metalLoadMode = .LoadMetalInCustomMetalLib
-//    metalLibPath = Bundle.main.path(forResource: "PaddleMobileMetal", ofType: "metallib") ?! " can't be nil "
-    preprocessKernel = MobilenetPreProccess.init(device: device)
-    inputDim = Dim.init(inDim: [1, 224, 224, 3])
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-  }
+    override public init(device: MTLDevice) {
+        super.init(device: device)
+        except = 0
+        modelPath = Bundle.main.path(forResource: "mobilenet_model", ofType: nil) ?! "model null"
+        paramPath = Bundle.main.path(forResource: "mobilenet_params", ofType: nil) ?! "para null"    
+        //    metalLoadMode = .LoadMetalInCustomMetalLib
+        //    metalLibPath = Bundle.main.path(forResource: "PaddleMobileMetal", ofType: "metallib") ?! " can't be nil "
+        preprocessKernel = MobilenetPreProccess.init(device: device)
+        inputDim = Dim.init(inDim: [1, 224, 224, 3])
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+    }
 }
 
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetCombined.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetCombined.swift
index 1ede49826d68d462676997d9dbb53a70cb52250a..1e644c3d54d0b7dafe2e3f98deabff62f64f153c 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetCombined.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetCombined.swift
@@ -16,18 +16,18 @@ import Foundation
 import paddle_mobile
 
 public class MobileNetCombined: Net {
-  @objc public override init(device: MTLDevice) {
-    super.init(device: device)
-    except = 0
-    modelPath = Bundle.main.path(forResource: "combined_mobilenet_model", ofType: nil) ?! "model null"
-    paramPath = Bundle.main.path(forResource: "combined_mobilenet_params", ofType: nil) ?! "para null"
-    inputDim = Dim.init(inDim: [1, 224, 224, 3])
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-  }
-  
-  override  public func resultStr(res: [ResultHolder]) -> String {
-    return " \(res[0].result[0]) ... "
-  }
-  
+    @objc public override init(device: MTLDevice) {
+        super.init(device: device)
+        except = 0
+        modelPath = Bundle.main.path(forResource: "combined_mobilenet_model", ofType: nil) ?! "model null"
+        paramPath = Bundle.main.path(forResource: "combined_mobilenet_params", ofType: nil) ?! "para null"
+        inputDim = Dim.init(inDim: [1, 224, 224, 3])
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+    }
+    
+    override  public func resultStr(res: [ResultHolder]) -> String {
+        return " \(res[0].result[0]) ... "
+    }
+    
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift
index 140aefdfb30c3b7b614b5db42ddc9fa8ac78b025..38d20557d2a51dd7b9943296b5fc13dc382ebba5 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift
@@ -16,84 +16,84 @@ import Foundation
 import paddle_mobile
 
 public class MobileNet_ssd_hand: Net {
-  @objc public override init(device: MTLDevice) {
-    super.init(device: device)
-    except = 2
-    modelPath = Bundle.main.path(forResource: "ssd_hand_model", ofType: nil) ?! "model null"
-    paramPath = Bundle.main.path(forResource: "ssd_hand_params", ofType: nil) ?! "para null"
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-    preprocessKernel = MobilenetssdPreProccess.init(device: device)
-    inputDim = Dim.init(inDim: [1, 300, 300, 3])
-  }
-  
-  @objc override public init(device: MTLDevice,inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer inModePointer: UnsafeMutableRawPointer, inModelSize: Int) {
-    super.init(device:device,inParamPointer:inParamPointer,inParamSize:inParamSize,inModelPointer:inModePointer,inModelSize:inModelSize)
-    except = 2
-    modelPath = ""
-    paramPath = ""
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-    preprocessKernel = MobilenetssdPreProccess.init(device: device)
-    inputDim = Dim.init(inDim: [1, 300, 300, 3])
-  }
-  
-  class MobilenetssdPreProccess: CusomKernel {
-    init(device: MTLDevice) {
-      let s = Shape.init(inWidth: 300, inHeight: 300, inChannel: 3)
-      super.init(device: device, inFunctionName: "mobilenet_ssd_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
+    @objc public override init(device: MTLDevice) {
+        super.init(device: device)
+        except = 2
+        modelPath = Bundle.main.path(forResource: "ssd_hand_model", ofType: nil) ?! "model null"
+        paramPath = Bundle.main.path(forResource: "ssd_hand_params", ofType: nil) ?! "para null"
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+        preprocessKernel = MobilenetssdPreProccess.init(device: device)
+        inputDim = Dim.init(inDim: [1, 300, 300, 3])
     }
-  }
-  
-  override public func resultStr(res: [ResultHolder]) -> String {
-    return " \(res[0])"
-  }
-  
-  override public func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] {
-
-//    guard let interRes = paddleMobileRes.intermediateResults else {
-//      fatalError(" need have inter result ")
-//    }
-//
-//    guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as?  Texture<Float32> else {
-//      fatalError(" need score ")
-//    }
-//
-//    guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture<Float32> else {
-//      fatalError()
-//    }
-//
-//    var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3]))
-////    print("score: ")
-////    print(scoreFormatArr.strideArray())
-////
-//    var bboxArr = bbox.metalTexture.float32Array()
-////    print("bbox: ")
-////    print(bboxArr.strideArray())
-//
-//    let nmsCompute = NMSCompute.init()
-//    nmsCompute.scoreThredshold = 0.01
-//    nmsCompute.nmsTopK = 400
-//    nmsCompute.keepTopK = 200
-//    nmsCompute.nmsEta = 1.0
-//    nmsCompute.nmsThreshold = 0.45
-//    nmsCompute.background_label = 0;
-//
-//    nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])]
-//
-//    nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])]
-//    guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else {
-//      fatalError( " result error " )
-//    }
-//
-//    let output: [Float32] = result.map { $0.floatValue }
-//
-//
-//    return output
-    fatalError()
-  }
-  
-
-  
- 
+    
+    @objc override public init(device: MTLDevice,inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer inModePointer: UnsafeMutableRawPointer, inModelSize: Int) {
+        super.init(device:device,inParamPointer:inParamPointer,inParamSize:inParamSize,inModelPointer:inModePointer,inModelSize:inModelSize)
+        except = 2
+        modelPath = ""
+        paramPath = ""
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+        preprocessKernel = MobilenetssdPreProccess.init(device: device)
+        inputDim = Dim.init(inDim: [1, 300, 300, 3])
+    }
+    
+    class MobilenetssdPreProccess: CusomKernel {
+        init(device: MTLDevice) {
+            let s = Shape.init(inWidth: 300, inHeight: 300, inChannel: 3)
+            super.init(device: device, inFunctionName: "mobilenet_ssd_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
+        }
+    }
+    
+    override public func resultStr(res: [ResultHolder]) -> String {
+        return " \(res[0])"
+    }
+    
+    override public func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] {
+        
+        //    guard let interRes = paddleMobileRes.intermediateResults else {
+        //      fatalError(" need have inter result ")
+        //    }
+        //
+        //    guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as?  Texture<Float32> else {
+        //      fatalError(" need score ")
+        //    }
+        //
+        //    guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture<Float32> else {
+        //      fatalError()
+        //    }
+        //
+        //    var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3]))
+        ////    print("score: ")
+        ////    print(scoreFormatArr.strideArray())
+        ////
+        //    var bboxArr = bbox.metalTexture.float32Array()
+        ////    print("bbox: ")
+        ////    print(bboxArr.strideArray())
+        //
+        //    let nmsCompute = NMSCompute.init()
+        //    nmsCompute.scoreThredshold = 0.01
+        //    nmsCompute.nmsTopK = 400
+        //    nmsCompute.keepTopK = 200
+        //    nmsCompute.nmsEta = 1.0
+        //    nmsCompute.nmsThreshold = 0.45
+        //    nmsCompute.background_label = 0;
+        //
+        //    nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])]
+        //
+        //    nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])]
+        //    guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else {
+        //      fatalError( " result error " )
+        //    }
+        //
+        //    let output: [Float32] = result.map { $0.floatValue }
+        //
+        //
+        //    return output
+        fatalError()
+    }
+    
+    
+    
+    
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobilenetSSD_AR.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobilenetSSD_AR.swift
index 134a07bba63837dc8c9984ef554d5033d937fc87..76feb0ecd07cd3b7d5405e32d674f695a629aa06 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobilenetSSD_AR.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobilenetSSD_AR.swift
@@ -16,137 +16,137 @@ import Foundation
 import paddle_mobile
 
 public class MobileNet_ssd_AR: Net {
-  @objc public override init(device: MTLDevice) {
-    super.init(device: device)
-    except = 2
-    modelPath = Bundle.main.path(forResource: "ar_model", ofType: nil) ?! "model null"
-    paramPath = Bundle.main.path(forResource: "ar_params", ofType: nil) ?! "para null"
-    preprocessKernel = MobilenetssdPreProccess.init(device: device)
-    inputDim = Dim.init(inDim: [1, 160, 160, 3])
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-  }
-  
-  @objc override public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) {
-    super.init(device:device,inParamPointer:inParamPointer,inParamSize:inParamSize,inModelPointer:inModelPointer,inModelSize:inModelSize)
-    except = 2
-    preprocessKernel = MobilenetssdPreProccess.init(device: device)
-    inputDim = Dim.init(inDim: [1, 160, 160, 3])
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-  }
-  
-  class MobilenetssdPreProccess: CusomKernel {
-    init(device: MTLDevice)  {
-      let s = Shape.init(inWidth: 160, inHeight: 160, inChannel: 3)
-      super.init(device: device, inFunctionName: "mobilent_ar_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
+    @objc public override init(device: MTLDevice) {
+        super.init(device: device)
+        except = 2
+        modelPath = Bundle.main.path(forResource: "ar_model", ofType: nil) ?! "model null"
+        paramPath = Bundle.main.path(forResource: "ar_params", ofType: nil) ?! "para null"
+        preprocessKernel = MobilenetssdPreProccess.init(device: device)
+        inputDim = Dim.init(inDim: [1, 160, 160, 3])
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
     }
-  }
-  
-  override public func resultStr(res: [ResultHolder]) -> String {
-    return " \(res[0].result[0])"
-  }
-  
-  override public func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] {
-    fatalError()
-//    guard let interRes = paddleMobileRes.intermediateResults else {
-//      fatalError(" need have inter result ")
-//    }
-//
-//    guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as?  FetchHolder else {
-//      fatalError(" need score ")
-//    }
-//
-//    guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? FetchHolder else {
-//      fatalError()
-//    }
     
-//    let startDate = Date.init()
+    @objc override public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) {
+        super.init(device:device,inParamPointer:inParamPointer,inParamSize:inParamSize,inModelPointer:inModelPointer,inModelSize:inModelSize)
+        except = 2
+        preprocessKernel = MobilenetssdPreProccess.init(device: device)
+        inputDim = Dim.init(inDim: [1, 160, 160, 3])
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+    }
     
-//    print("scoreFormatArr: ")
-//print((0..<score.capacity).map{ score.result[$0] }.strideArray())
-//
-//    print("bbox arr: ")
-//
-//    print((0..<bbox.capacity).map{ bbox.result[$0] }.strideArray())
+    class MobilenetssdPreProccess: CusomKernel {
+        init(device: MTLDevice)  {
+            let s = Shape.init(inWidth: 160, inHeight: 160, inChannel: 3)
+            super.init(device: device, inFunctionName: "mobilent_ar_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
+        }
+    }
     
-//    let nmsCompute = NMSCompute.init()
-//    nmsCompute.scoreThredshold = 0.25
-//    nmsCompute.nmsTopK = 100
-//    nmsCompute.keepTopK = 100
-//    nmsCompute.nmsEta = 1.0
-//    nmsCompute.nmsThreshold = 0.449999988
-//    nmsCompute.background_label = 0;
-//    nmsCompute.scoreDim = [NSNumber.init(value: score.dim[0]), NSNumber.init(value: score.dim[1]), NSNumber.init(value: score.dim[2])]
-//    nmsCompute.bboxDim = [NSNumber.init(value: bbox.dim[0]), NSNumber.init(value: bbox.dim[1]), NSNumber.init(value: bbox.dim[2])]
-//    guard let result = nmsCompute.compute(withScore: score.result, andBBoxs: bbox.result) else {
-//      fatalError( " result error " )
-//    }
-//    let resultHolder = ResultHolder.init(inResult: result.output, inCapacity: Int(result.outputSize))
-//    for i in 0..<Int(result.outputSize) {
-//
-//      print("i \(i) : \(result.output[i])")
-//    }
-//    print(Date.init().timeIntervalSince(startDate))
-
-//    print(resultHolder.result![0])
-//    return resultHolder
-  }
-  
-//  override func updateProgram(program: Program) {
-  
-//    for i in [56, 66, 76, 86, 93, 99] {
-//      let opDesc = program.programDesc.blocks[0].ops[i]
-//      let output = opDesc.outputs["Out"]!.first!
-//      let v = program.scope[output]!
-//      let originTexture = v as! Texture
-//      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1] / 7, originTexture.tensorDim[0] * 7])
-//      
-//      originTexture.dim = Dim.init(inDim: [1, 1, originTexture.dim[3] / 7, originTexture.dim[2] * 7])
-//      
-//      originTexture.padToFourDim = Dim.init(inDim: [1, 1, originTexture.padToFourDim[3] / 7, originTexture.padToFourDim[2] * 7])
-//      
-//      program.scope[output] = originTexture
-//      
-//      if i == 99 {
-//        opDesc.attrs["axis"] = 0
-//      } else {
-//        opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
-//      }
-//    }
-//    
-//    for i in [58, 59, 88, 89, 95, 96, 68, 69, 78, 79] {
-//      let opDesc = program.programDesc.blocks[0].ops[i]
-//      let output = opDesc.outputs["Out"]!.first!
-//      let v = program.scope[output]!
-//      
-//      
-//      
-//      let originTexture = v as! Texture
-//      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
-//      opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
-//    }
-//    
-//    for i in [60, 101, 90, 97, 70, 80] {
-//      let opDesc = program.programDesc.blocks[0].ops[i]
-//      let output = opDesc.outputs["Out"]!.first!
-//      let v = program.scope[output]!
-//      let originTexture = v as! Texture
-//      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
-//      opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
-//    }
-//    
-//    for i in [102] {
-//      let opDesc = program.programDesc.blocks[0].ops[i]
-//      for output in opDesc.outputs["Out"]! {
-//        let v = program.scope[output]!
-//        let originTexture = v as! Texture
-//        originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
-//      }
-//      opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
-//      print(" split axis \(opDesc.attrs["axis"])")
-//    }
+    override public func resultStr(res: [ResultHolder]) -> String {
+        return " \(res[0].result[0])"
+    }
+    
+    override public func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] {
+        fatalError()
+        //    guard let interRes = paddleMobileRes.intermediateResults else {
+        //      fatalError(" need have inter result ")
+        //    }
+        //
+        //    guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as?  FetchHolder else {
+        //      fatalError(" need score ")
+        //    }
+        //
+        //    guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? FetchHolder else {
+        //      fatalError()
+        //    }
+        
+        //    let startDate = Date.init()
+        
+        //    print("scoreFormatArr: ")
+        //print((0..<score.capacity).map{ score.result[$0] }.strideArray())
+        //
+        //    print("bbox arr: ")
+        //
+        //    print((0..<bbox.capacity).map{ bbox.result[$0] }.strideArray())
+        
+        //    let nmsCompute = NMSCompute.init()
+        //    nmsCompute.scoreThredshold = 0.25
+        //    nmsCompute.nmsTopK = 100
+        //    nmsCompute.keepTopK = 100
+        //    nmsCompute.nmsEta = 1.0
+        //    nmsCompute.nmsThreshold = 0.449999988
+        //    nmsCompute.background_label = 0;
+        //    nmsCompute.scoreDim = [NSNumber.init(value: score.dim[0]), NSNumber.init(value: score.dim[1]), NSNumber.init(value: score.dim[2])]
+        //    nmsCompute.bboxDim = [NSNumber.init(value: bbox.dim[0]), NSNumber.init(value: bbox.dim[1]), NSNumber.init(value: bbox.dim[2])]
+        //    guard let result = nmsCompute.compute(withScore: score.result, andBBoxs: bbox.result) else {
+        //      fatalError( " result error " )
+        //    }
+        //    let resultHolder = ResultHolder.init(inResult: result.output, inCapacity: Int(result.outputSize))
+        //    for i in 0..<Int(result.outputSize) {
+        //
+        //      print("i \(i) : \(result.output[i])")
+        //    }
+        //    print(Date.init().timeIntervalSince(startDate))
+        
+        //    print(resultHolder.result![0])
+        //    return resultHolder
+    }
+    
+    //  override func updateProgram(program: Program) {
+    
+    //    for i in [56, 66, 76, 86, 93, 99] {
+    //      let opDesc = program.programDesc.blocks[0].ops[i]
+    //      let output = opDesc.outputs["Out"]!.first!
+    //      let v = program.scope[output]!
+    //      let originTexture = v as! Texture
+    //      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1] / 7, originTexture.tensorDim[0] * 7])
+    //      
+    //      originTexture.dim = Dim.init(inDim: [1, 1, originTexture.dim[3] / 7, originTexture.dim[2] * 7])
+    //      
+    //      originTexture.padToFourDim = Dim.init(inDim: [1, 1, originTexture.padToFourDim[3] / 7, originTexture.padToFourDim[2] * 7])
+    //      
+    //      program.scope[output] = originTexture
+    //      
+    //      if i == 99 {
+    //        opDesc.attrs["axis"] = 0
+    //      } else {
+    //        opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
+    //      }
+    //    }
+    //    
+    //    for i in [58, 59, 88, 89, 95, 96, 68, 69, 78, 79] {
+    //      let opDesc = program.programDesc.blocks[0].ops[i]
+    //      let output = opDesc.outputs["Out"]!.first!
+    //      let v = program.scope[output]!
+    //      
+    //      
+    //      
+    //      let originTexture = v as! Texture
+    //      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
+    //      opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
+    //    }
+    //    
+    //    for i in [60, 101, 90, 97, 70, 80] {
+    //      let opDesc = program.programDesc.blocks[0].ops[i]
+    //      let output = opDesc.outputs["Out"]!.first!
+    //      let v = program.scope[output]!
+    //      let originTexture = v as! Texture
+    //      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
+    //      opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
+    //    }
+    //    
+    //    for i in [102] {
+    //      let opDesc = program.programDesc.blocks[0].ops[i]
+    //      for output in opDesc.outputs["Out"]! {
+    //        let v = program.scope[output]!
+    //        let originTexture = v as! Texture
+    //        originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
+    //      }
+    //      opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
+    //      print(" split axis \(opDesc.attrs["axis"])")
+    //    }
     // 99
-//  }
-  
+    //  }
+    
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal
index 9bd2c26e34ab1e5d5ddcb8e15cf8bb46df080a65..99bd8f4a03d6e58d23595d7a8effa48c7ebf5919 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal
@@ -17,9 +17,9 @@ using namespace metal;
 
 
 kernel void mobilenet_preprocess(
-                       texture2d<float, access::read> inTexture [[texture(0)]],
-                       texture2d<float, access::write> outTexture [[texture(1)]],
-                       uint2 gid [[thread_position_in_grid]])
+                                 texture2d<float, access::read> inTexture [[texture(0)]],
+                                 texture2d<float, access::write> outTexture [[texture(1)]],
+                                 uint2 gid [[thread_position_in_grid]])
 {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height()) {
@@ -31,9 +31,9 @@ kernel void mobilenet_preprocess(
 }
 
 kernel void mobilenet_preprocess_half(
-                       texture2d<half, access::read> inTexture [[texture(0)]],
-                       texture2d<half, access::write> outTexture [[texture(1)]],
-                       uint2 gid [[thread_position_in_grid]])
+                                      texture2d<half, access::read> inTexture [[texture(0)]],
+                                      texture2d<half, access::write> outTexture [[texture(1)]],
+                                      uint2 gid [[thread_position_in_grid]])
 {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height()) {
@@ -45,9 +45,9 @@ kernel void mobilenet_preprocess_half(
 }
 
 kernel void mobilenet_ssd_preprocess(
-                       texture2d<float, access::read> inTexture [[texture(0)]],
-                       texture2d<float, access::write> outTexture [[texture(1)]],
-                       uint2 gid [[thread_position_in_grid]])
+                                     texture2d<float, access::read> inTexture [[texture(0)]],
+                                     texture2d<float, access::write> outTexture [[texture(1)]],
+                                     uint2 gid [[thread_position_in_grid]])
 {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height()) {
@@ -59,9 +59,9 @@ kernel void mobilenet_ssd_preprocess(
 }
 
 kernel void mobilenet_ssd_preprocess_half(
-                            texture2d<half, access::read> inTexture [[texture(0)]],
-                            texture2d<half, access::write> outTexture [[texture(1)]],
-                            uint2 gid [[thread_position_in_grid]])
+                                          texture2d<half, access::read> inTexture [[texture(0)]],
+                                          texture2d<half, access::write> outTexture [[texture(1)]],
+                                          uint2 gid [[thread_position_in_grid]])
 {
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height()) {
@@ -74,44 +74,44 @@ kernel void mobilenet_ssd_preprocess_half(
 
 kernel void genet_preprocess(texture2d<float, access::read> inTexture [[texture(0)]], texture2d<float, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]])
 {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) {
-    return;
-  }
-  const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f);
-  const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
-  outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f);
+    const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
+    outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
 }
 
 kernel void genet_preprocess_half(texture2d<half, access::read> inTexture [[texture(0)]], texture2d<half, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]])
 {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) {
-    return;
-  }
-  const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f);
-  const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
-  outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f);
+    const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
+    outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
 }
 
 kernel void mobilent_ar_preprocess(texture2d<float, access::read> inTexture [[texture(0)]], texture2d<float, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]])
 {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) {
-    return;
-  }
-  const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f);
-  const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
-  outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f);
+    const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
+    outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
 }
 
 kernel void mobilent_ar_preprocess_half(texture2d<half, access::read> inTexture [[texture(0)]], texture2d<half, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]])
 {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) {
-    return;
-  }
-  const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f);
-  const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
-  outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f);
+    const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
+    outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift
index f5f4ef81e9fcb8f8c21c8cadd8506fa884d18ce5..caaef97695f4cdecf61568e4f5aba6b5afcffcc8 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift
@@ -17,18 +17,18 @@ import Foundation
 import paddle_mobile
 
 public class YoloNet: Net {
-  @objc public override init(device: MTLDevice) {
-    super.init(device: device)
-    except = 0
-    modelPath = Bundle.main.path(forResource: "yolo_model", ofType: nil) ?! "model null"
-    paramPath = Bundle.main.path(forResource: "yolo_params", ofType: nil) ?! "para null"
-    inputDim = Dim.init(inDim: [1, 416, 416, 3])
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-  }
-
-  override  public func resultStr(res: [ResultHolder]) -> String {
-    return " \(res[0].result[0]) ... "
-  }
-  
+    @objc public override init(device: MTLDevice) {
+        super.init(device: device)
+        except = 0
+        modelPath = Bundle.main.path(forResource: "yolo_model", ofType: nil) ?! "model null"
+        paramPath = Bundle.main.path(forResource: "yolo_params", ofType: nil) ?! "para null"
+        inputDim = Dim.init(inDim: [1, 416, 416, 3]) // presumably N, H, W, C order - TODO confirm
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") // nil when the metallib is not in the main bundle
+    }
+    /// Builds a display string from the first value of the first result holder; indexes `res[0]` directly, so `res` must be non-empty.
+    override  public func resultStr(res: [ResultHolder]) -> String {
+        return " \(res[0].result[0]) ... "
+    }
+    
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.m b/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.m
index 586fc91a7f46904ca61c8eaefcbec42098c3d0a8..5bef9317b173d94c40008bf60c98c32a01f32dd2 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.m
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.m
@@ -34,83 +34,83 @@
 @implementation LoadPointerViewController
 
 - (void)viewDidLoad {
-  [super viewDidLoad];
-  
-  self.imageView.image = [UIImage imageNamed:@"banana.jpeg"];
-  
-  NSString *modelPath = [[NSBundle mainBundle] URLForResource:@"super_model" withExtension:nil].path;
-  NSString *paramPath = [[NSBundle mainBundle] URLForResource:@"super_params" withExtension:nil].path;
-
-  long fileSize;
-  FILE *fp;
-  fp = fopen([modelPath UTF8String], "rb");
-  fseek(fp, 0, SEEK_END);
-  fileSize = ftell(fp);
-  rewind(fp);
-  void *buffer = malloc(fileSize);
-  fread(buffer, 1, fileSize, fp);
-  fclose(fp);
-  
-  long paramfileSize;
-  FILE *parmaFilePointer;
-  parmaFilePointer = fopen([paramPath UTF8String], "rb");
-  fseek(parmaFilePointer, 0, SEEK_END);
-  paramfileSize = ftell(parmaFilePointer);
-  rewind(parmaFilePointer);
-  void *parmaBuffer = malloc(paramfileSize);
-  fread(parmaBuffer, 1, paramfileSize, parmaFilePointer);
-  fclose(parmaFilePointer);
-  
-  _modelConfig = [[ModelConfig alloc] init];
-  _modelConfig.modelPointer = buffer;
-  _modelConfig.modelSize = (int)fileSize;
-  _modelConfig.paramPointer = parmaBuffer;
-  _modelConfig.paramSize = (int)paramfileSize;
+    [super viewDidLoad];
+    
+    self.imageView.image = [UIImage imageNamed:@"banana.jpeg"];
+    
+    NSString *modelPath = [[NSBundle mainBundle] URLForResource:@"super_model" withExtension:nil].path;
+    NSString *paramPath = [[NSBundle mainBundle] URLForResource:@"super_params" withExtension:nil].path;
+    
+    long fileSize;
+    FILE *fp;
+    fp = fopen([modelPath UTF8String], "rb");
+    fseek(fp, 0, SEEK_END);
+    fileSize = ftell(fp);
+    rewind(fp);
+    void *buffer = malloc(fileSize);
+    fread(buffer, 1, fileSize, fp);
+    fclose(fp);
+    
+    long paramfileSize;
+    FILE *parmaFilePointer;
+    parmaFilePointer = fopen([paramPath UTF8String], "rb");
+    fseek(parmaFilePointer, 0, SEEK_END);
+    paramfileSize = ftell(parmaFilePointer);
+    rewind(parmaFilePointer);
+    void *parmaBuffer = malloc(paramfileSize);
+    fread(parmaBuffer, 1, paramfileSize, parmaFilePointer);
+    fclose(parmaFilePointer);
+    
+    _modelConfig = [[ModelConfig alloc] init];
+    _modelConfig.modelPointer = buffer;
+    _modelConfig.modelSize = (int)fileSize;
+    _modelConfig.paramPointer = parmaBuffer;
+    _modelConfig.paramSize = (int)paramfileSize;
 }
 - (IBAction)loaderButtonPressed:(id)sender {
-  self.paddleMobile = [[PaddleMobileGPU alloc] initWithCommandQueue:MetalHelper.shared.queue net:SuperResolutionNetType modelConfig:_modelConfig];
-  _loaded = [self.paddleMobile load];
-  NSLog(@" load 结果: %@", _loaded ? @"成功" : @"失败");
+    self.paddleMobile = [[PaddleMobileGPU alloc] initWithCommandQueue:MetalHelper.shared.queue net:SuperResolutionNetType modelConfig:_modelConfig];
+    _loaded = [self.paddleMobile load];
+    NSLog(@" load 结果: %@", _loaded ? @"成功" : @"失败");
 }
 - (IBAction)predictButtonPressed:(id)sender {
-  [self predict];
+    [self predict];
 }
 
 - (void)predict {
-  UIImage *image = self.imageView.image;
-  if (!image) {
-    NSLog(@" image is nil");
-    return;
-  }
-  id<MTLTexture> texture = [MetalHelper.shared.textureLoader newTextureWithCGImage:image.CGImage options:nil error:nil];
-  _texture = texture;
-  if (!_texture) {
-    NSLog(@" texture is nil");
-    return;
-  }
-  
-  if (!self.loaded) {
-    NSLog(@" not load ");
-    return;
-  }
-  
-  NSTimeInterval startTime = [[NSDate date] timeIntervalSince1970];
-  NSInteger max = 1;
-  for (int i = 0;i < max; i ++) {
-    [self.paddleMobile predict:_texture withCompletion:^(BOOL success , NSArray<NSNumber *> *result) {
-      if (success) {
-        if (i == max -1) {
-          double time = [[NSDate date] timeIntervalSince1970] - startTime;
-          time = (time/max)*1000;
-          NSLog(@"gap ==== %fms",time);
-        }
-      }
-    }];
-  }
+    UIImage *image = self.imageView.image;
+    if (!image) {
+        NSLog(@" image is nil");
+        return;
+    }
+    id<MTLTexture> texture = [MetalHelper.shared.textureLoader newTextureWithCGImage:image.CGImage options:nil error:nil];
+    _texture = texture;
+    if (!_texture) {
+        NSLog(@" texture is nil");
+        return;
+    }
+    // Refuse to predict until the loaded flag is set (it is assigned from -load in loaderButtonPressed:).
+    if (!self.loaded) {
+        NSLog(@" not load ");
+        return;
+    }
+    // Issue `max` predictions; when the last one reports success, log elapsed time / max in milliseconds.
+    NSTimeInterval startTime = [[NSDate date] timeIntervalSince1970];
+    NSInteger max = 1;
+    for (int i = 0;i < max; i ++) {
+        [self.paddleMobile predict:_texture withCompletion:^(BOOL success , NSArray<NSArray <NSNumber *>*> *result) {
+            if (success) {
+                if (i == max -1) {
+                    double time = [[NSDate date] timeIntervalSince1970] - startTime;
+                    time = (time/max)*1000;
+                    NSLog(@"gap ==== %fms",time);
+                }
+            }
+        }];
+    }
 }
 - (IBAction)clear:(id)sender {
-  [self.paddleMobile clear];
-  self.loaded = NO;
+    [self.paddleMobile clear];
+    self.loaded = NO;
 }
 
 @end
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.h b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.h
index cd99ddad430902d1403c0b2e1752c15fa4f21722..d45d7daaa19d4338238398981285567538bd1d0b 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.h
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.h
@@ -16,8 +16,8 @@
 #import <Foundation/Foundation.h>
 
 typedef enum : NSUInteger {
-  SuperResolutionNetType,
-  MobileNetSSDType
+    SuperResolutionNetType,
+    MobileNetSSDType
 } NetType;
 
 @interface PaddleMobileGPUResult: NSObject
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.m b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.m
index 670753fd9f99aae2b6064440bdc42538bd6648a4..881a6cb5059cd2e30bb78bca6be33beec20c29b2 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.m
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.m
@@ -30,75 +30,75 @@
 
 @implementation PaddleMobileGPUResult
 - (void)setOutputResult:(ResultHolder *)resultHolder {
-  self.resultHolder = resultHolder;
-  self.output = resultHolder.result;
-  self.outputSize = resultHolder.capacity;
+    self.resultHolder = resultHolder;
+    self.output = resultHolder.result;
+    self.outputSize = resultHolder.capacity;
 }
 
 -(void)releaseOutput {
-  [self.resultHolder releasePointer];
+    [self.resultHolder releasePointer];
 }
 @end
 
 @interface PaddleMobileGPU ()
 {
-  Runner *runner;
+    Runner *runner;
 }
 @end
 
 @implementation PaddleMobileGPU
 
 -(instancetype)initWithCommandQueue:(id<MTLCommandQueue>)queue net:(NetType)netType modelConfig:(ModelConfig *)config {
-  self = [super init];
-  if (self) {
-    Net *net = nil;
-    if (netType == SuperResolutionNetType) {
-      net = [[SuperResolutionNet alloc] initWithDevice:queue.device inParamPointer:config.paramPointer inParamSize:config.paramSize inModelPointer:config.modelPointer inModelSize:config.modelSize];
-    } else if (netType == MobileNetSSDType) {
-      net = [[MobileNet_ssd_AR alloc] initWithDevice:queue.device inParamPointer:config.paramPointer inParamSize:config.paramSize inModelPointer:config.modelPointer inModelSize:config.modelSize];
+    self = [super init];
+    if (self) {
+        Net *net = nil;
+        if (netType == SuperResolutionNetType) {
+            net = [[SuperResolutionNet alloc] initWithDevice:queue.device inParamPointer:config.paramPointer inParamSize:config.paramSize inModelPointer:config.modelPointer inModelSize:config.modelSize];
+        } else if (netType == MobileNetSSDType) {
+            net = [[MobileNet_ssd_AR alloc] initWithDevice:queue.device inParamPointer:config.paramPointer inParamSize:config.paramSize inModelPointer:config.modelPointer inModelSize:config.modelSize];
+        }
+        runner = [[Runner alloc] initInNet:net commandQueue:queue];
     }
-    runner = [[Runner alloc] initInNet:net commandQueue:queue];
-  }
-  return self;
+    return self;
 }
 
 -(BOOL)load {
-  return [runner load];
+    return [runner load];
 }
 
 -(void)predict:(id<MTLTexture>)texture withCompletion:(void (^)(BOOL, NSArray<NSArray <NSNumber *>*> *))completion {
-  
-  [runner predictWithTexture:texture completion:^(BOOL success, NSArray<ResultHolder *> * _Nullable resultArr) {
-    NSMutableArray<NSMutableArray <NSNumber *>*> *ocResultArray = [NSMutableArray arrayWithCapacity:resultArr.count];
-    for (int i = 0; i < resultArr.count; ++i) {
-      ResultHolder *resultHolder = resultArr[i];
-      NSMutableArray <NSNumber *>*res = [NSMutableArray arrayWithCapacity:resultHolder.capacity];
-      for (int j = 0; j < resultHolder.capacity; ++j) {
-        [res addObject:[NSNumber numberWithFloat:resultHolder.result[i]]];
-      }
-      [ocResultArray addObject:res];
-      [resultHolder releasePointer];
-    }
-    completion(success, ocResultArray);
-  }];
+    // Converts every ResultHolder into an array of NSNumber (one per element), calls releasePointer on it, then reports all arrays.
+    [runner predictWithTexture:texture completion:^(BOOL success, NSArray<ResultHolder *> * _Nullable resultArr) {
+        NSMutableArray<NSMutableArray <NSNumber *>*> *ocResultArray = [NSMutableArray arrayWithCapacity:resultArr.count];
+        for (NSUInteger i = 0; i < resultArr.count; ++i) {
+            ResultHolder *resultHolder = resultArr[i];
+            NSMutableArray <NSNumber *>*res = [NSMutableArray arrayWithCapacity:resultHolder.capacity];
+            for (NSInteger j = 0; j < resultHolder.capacity; ++j) {
+                [res addObject:[NSNumber numberWithFloat:resultHolder.result[j]]]; // element index is j; i selects the holder
+            }
+            [ocResultArray addObject:res];
+            [resultHolder releasePointer];
+        }
+        completion(success, ocResultArray);
+    }];
 }
 
 -(void)predict:(id<MTLTexture>)texture withResultCompletion:(void (^)(BOOL, NSArray <PaddleMobileGPUResult *> *))completion {
-  [runner predictWithTexture:texture completion:^(BOOL success, NSArray<ResultHolder *> * _Nullable resultArr) {
-    NSMutableArray <PaddleMobileGPUResult *> *ocResultArr = [NSMutableArray arrayWithCapacity:resultArr.count];
-    for (int i = 0; i < resultArr.count; ++i) {
-      ResultHolder *result = resultArr[i];
-      PaddleMobileGPUResult *gpuResult = [[PaddleMobileGPUResult alloc] init];
-      gpuResult.dim = result.dim;
-      [gpuResult setOutputResult:result];
-      [ocResultArr addObject:gpuResult];
-    }
-    completion(success, ocResultArr);
-  }];
+    [runner predictWithTexture:texture completion:^(BOOL success, NSArray<ResultHolder *> * _Nullable resultArr) {
+        NSMutableArray <PaddleMobileGPUResult *> *ocResultArr = [NSMutableArray arrayWithCapacity:resultArr.count];
+        for (int i = 0; i < resultArr.count; ++i) {
+            ResultHolder *result = resultArr[i];
+            PaddleMobileGPUResult *gpuResult = [[PaddleMobileGPUResult alloc] init];
+            gpuResult.dim = result.dim;
+            [gpuResult setOutputResult:result];
+            [ocResultArr addObject:gpuResult];
+        }
+        completion(success, ocResultArr);
+    }];
 }
 
 -(void)clear {
-  [runner clear];
+    [runner clear];
 }
 
 @end
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift
index d2bebb2668b4e21d5b7ca4ebc68c037c9b9458b7..50dd29095e19c3cbe4f25e18ecd2690cddea1027 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift
@@ -16,57 +16,57 @@ import Foundation
 import paddle_mobile
 
 @objc public class SuperResolutionNet: Net{
-  override public func resultStr(res: [ResultHolder]) -> String {
-    return "未实现"
-  }
-  
-  public override init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize: Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) {
-    super.init(device: device)
-    except = 0
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-    inputDim = Dim.init(inDim: [1, 224, 224, 3])
-    self.paramPointer = inParamPointer
-    self.paramSize = inParamSize
-    self.modelPointer = inModelPointer
-    self.modelSize = inModelSize
-  }
+    override public func resultStr(res: [ResultHolder]) -> String {
+        return "未实现"
+    }
+    
+    public override init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize: Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) {
+        super.init(device: device)
+        except = 0
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+        inputDim = Dim.init(inDim: [1, 224, 224, 3])
+        self.paramPointer = inParamPointer
+        self.paramSize = inParamSize
+        self.modelPointer = inModelPointer
+        self.modelSize = inModelSize
+    }
+    
+    @objc override public init(device: MTLDevice) {
+        super.init(device: device)
+        except = 0
+        modelPath = Bundle.main.path(forResource: "super_model", ofType: nil) ?! "model null"
+        paramPath = Bundle.main.path(forResource: "super_params", ofType: nil) ?! "para null"
+        preprocessKernel = nil
+        inputDim = Dim.init(inDim: [1, 224, 224, 1])
+        metalLoadMode = .LoadMetalInCustomMetalLib
+        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+    }
     
-  @objc override public init(device: MTLDevice) {
-    super.init(device: device)
-    except = 0
-    modelPath = Bundle.main.path(forResource: "super_model", ofType: nil) ?! "model null"
-    paramPath = Bundle.main.path(forResource: "super_params", ofType: nil) ?! "para null"
-    preprocessKernel = nil
-    inputDim = Dim.init(inDim: [1, 224, 224, 1])
-    metalLoadMode = .LoadMetalInCustomMetalLib
-    metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-  }
-  
-  override public func updateProgram(program: Program) {
-    // n h w c
-    for block in program.programDesc.blocks {
-      for varDesc in block.vars {
-        if !varDesc.persistable {
-          if varDesc.type == .LodTensor {
-            let varEle = program.scope.vars[varDesc.name]
-            if let texture = varEle as? Texture {
-              let newDim = Dim.init(inDim: [texture.dim[0],  inputDim[1], inputDim[2], texture.tensorDim[1]])
-              print(" var desc name " + varDesc.name + " new dim" + "\(newDim)")
-              
-              texture.updateDims(inTensorDim: Dim.init(inDim: [texture.tensorDim[0], texture.tensorDim[1], inputDim[1], inputDim[2]]), inDim: newDim)
-              texture.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: GlobalConfig.shared.computePrecision)
-              
-              let output: FetchHolder = program.scope.output() as! FetchHolder
-              output.dim = newDim
-              output.capacity = newDim.numel()
-              output.paddedCapacity = newDim.numel() * 4
-              output.initBuffer(device: device)
+    override public func updateProgram(program: Program) {
+        // n h w c
+        for block in program.programDesc.blocks {
+            for varDesc in block.vars {
+                if !varDesc.persistable {
+                    if varDesc.type == .LodTensor {
+                        let varEle = program.scope.vars[varDesc.name]
+                        if let texture = varEle as? Texture {
+                            let newDim = Dim.init(inDim: [texture.dim[0],  inputDim[1], inputDim[2], texture.tensorDim[1]])
+                            print(" var desc name " + varDesc.name + " new dim" + "\(newDim)")
+                            
+                            texture.updateDims(inTensorDim: Dim.init(inDim: [texture.tensorDim[0], texture.tensorDim[1], inputDim[1], inputDim[2]]), inDim: newDim)
+                            texture.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: GlobalConfig.shared.computePrecision)
+                            
+                            let output: FetchHolder = program.scope.output() as! FetchHolder
+                            output.dim = newDim
+                            output.capacity = newDim.numel()
+                            output.paddedCapacity = newDim.numel() * 4
+                            output.initBuffer(device: device)
+                        }
+                    }
+                }
             }
-          }
         }
-      }
     }
-  }
 }
 
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift
index f9e841f9c2a3060e775726023b6d5cfc3eeb679d..0080aa80f69cdbca5b132cd3019f2d9bedac3397 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift
@@ -4,28 +4,28 @@ import Foundation
 import QuartzCore
 
 public class FPSCounter {
-  private(set) public var fps: Double = 0
-
-  var frames = 0
-  var startTime: CFTimeInterval = 0
-
-  public func start() {
-    frames = 0
-    startTime = CACurrentMediaTime()
-  }
-
-  public func frameCompleted() {
-    frames += 1
-    let now = CACurrentMediaTime()
-    let elapsed = now - startTime
-    if elapsed > 0.1 {
-      let current = Double(frames) / elapsed
-      let smoothing = 0.75
-      fps = smoothing*fps + (1 - smoothing)*current
-      if elapsed > 1 {
+    private(set) public var fps: Double = 0
+    
+    var frames = 0
+    var startTime: CFTimeInterval = 0
+    
+    public func start() {
         frames = 0
         startTime = CACurrentMediaTime()
-      }
     }
-  }
+    
+    public func frameCompleted() {
+        frames += 1
+        let now = CACurrentMediaTime()
+        let elapsed = now - startTime
+        if elapsed > 0.1 {
+            let current = Double(frames) / elapsed
+            let smoothing = 0.75
+            fps = smoothing*fps + (1 - smoothing)*current
+            if elapsed > 1 {
+                frames = 0
+                startTime = CACurrentMediaTime()
+            }
+        }
+    }
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift
index c235ed2f0391bdc97e9e182c0e9897814a0518fa..cb639544872439b8595789b3df30cdfe49eb5ef0 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift
@@ -6,15 +6,15 @@ import AVFoundation
 
 @available(iOS 10.0, *)
 @objc public protocol VideoCaptureDelegate: NSObjectProtocol {
-  @objc optional func videoCapture(_ capture: VideoCapture, didCaptureSampleBuffer sampleBuffer: CMSampleBuffer, timestamp: CMTime)
+    @objc optional func videoCapture(_ capture: VideoCapture, didCaptureSampleBuffer sampleBuffer: CMSampleBuffer, timestamp: CMTime)
     @objc optional func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime)
     @objc optional func videoCapture(_ capture: VideoCapture, didCapturePhoto previewImage: UIImage?)
     @objc optional func videoCapture(_ capture: VideoCapture, didCapturePhotoTexture texture: MTLTexture?)
 }
 
 /**
-  Simple interface to the iPhone's camera.
-*/
+ Simple interface to the iPhone's camera.
+ */
 @available(iOS 10.0, *)
 public class VideoCapture: NSObject {
     public var previewLayer: AVCaptureVideoPreviewLayer?
@@ -35,9 +35,9 @@ public class VideoCapture: NSObject {
         self.cameraPosition = position
         super.init()
     }
-
+    
     public func setUp(sessionPreset: AVCaptureSession.Preset = .medium,
-                    completion: @escaping (Bool) -> Void) {
+                      completion: @escaping (Bool) -> Void) {
         queue.async {
             let success = self.setUpCamera(sessionPreset: sessionPreset)
             DispatchQueue.main.async {
@@ -45,7 +45,7 @@ public class VideoCapture: NSObject {
             }
         }
     }
-
+    
     func fontCamera() -> AVCaptureDevice? {
         let deveices = AVCaptureDevice.DiscoverySession.init(deviceTypes: [.builtInWideAngleCamera], mediaType: AVMediaType.video, position: .front).devices
         return deveices.first
@@ -62,7 +62,7 @@ public class VideoCapture: NSObject {
         
         captureSession.beginConfiguration()
         captureSession.sessionPreset = sessionPreset
-
+        
         var oCaptureDevice: AVCaptureDevice?
         switch cameraPosition {
         case .back:
@@ -79,56 +79,56 @@ public class VideoCapture: NSObject {
             print("Error: no video devices available")
             return false
         }
-
+        
         guard let videoInput = try? AVCaptureDeviceInput(device: captureDevice) else {
             print("Error: could not create AVCaptureDeviceInput")
             return false
         }
-
+        
         if captureSession.canAddInput(videoInput) {
             captureSession.addInput(videoInput)
         }
-
+        
         let previewLayer = AVCaptureVideoPreviewLayer(session: captureSession)
         previewLayer.videoGravity = AVLayerVideoGravity.resizeAspect
         previewLayer.connection?.videoOrientation = self.videoOrientation
         self.previewLayer = previewLayer
-
+        
         let settings: [String : Any] = [
-        kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA)
+            kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA)
         ]
-
+        
         videoOutput.videoSettings = settings
         videoOutput.alwaysDiscardsLateVideoFrames = true
         videoOutput.setSampleBufferDelegate(self, queue: queue)
         if captureSession.canAddOutput(videoOutput) {
             captureSession.addOutput(videoOutput)
         }
-
+        
         // We want the buffers to be in portrait orientation otherwise they are
         // rotated by 90 degrees. Need to set this _after_ addOutput()!
         videoOutput.connection(with: AVMediaType.video)?.videoOrientation = self.videoOrientation
-
+        
         if captureSession.canAddOutput(photoOutput) {
             captureSession.addOutput(photoOutput)
         }
-
+        
         captureSession.commitConfiguration()
         return true
     }
-
+    
     public func start() {
         if !captureSession.isRunning {
             captureSession.startRunning()
         }
     }
-
+    
     public func stop() {
         if captureSession.isRunning {
             captureSession.stopRunning()
         }
     }
-
+    
     /* Captures a single frame of the camera input. */
     public func capturePhoto() {
         let settings = AVCapturePhotoSettings(format: [kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA)])
@@ -139,7 +139,7 @@ public class VideoCapture: NSObject {
         ]
         photoOutput.capturePhoto(with: settings, delegate: self)
     }
-
+    
     func convertToMTLTexture(sampleBuffer: CMSampleBuffer?) -> MTLTexture? {
         if let textureCache = textureCache, let sampleBuffer = sampleBuffer, let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) {
             let width = CVPixelBufferGetWidth(imageBuffer)
@@ -152,7 +152,7 @@ public class VideoCapture: NSObject {
         }
         return nil
     }
-
+    
     func convertToUIImage(sampleBuffer: CMSampleBuffer?) -> UIImage? {
         if let sampleBuffer = sampleBuffer,
             let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) {
@@ -172,47 +172,47 @@ public class VideoCapture: NSObject {
 
 @available(iOS 10.0, *)
 extension VideoCapture: AVCaptureVideoDataOutputSampleBufferDelegate {
-  public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
-    // Because lowering the capture device's FPS looks ugly in the preview,
-    // we capture at full speed but only call the delegate at its desired
-    // framerate. If `fps` is -1, we run at the full framerate.
-    let timestamp = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
-    let deltaTime = timestamp - lastTimestamp
-    if fps == -1 || deltaTime >= CMTimeMake(1, Int32(fps)) {
-        lastTimestamp = timestamp
-        self.delegate?.videoCapture?(self, didCaptureSampleBuffer: sampleBuffer, timestamp: timestamp)
-        if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCaptureVideoTexture:timestamp:))) ?? false{
-            let texture = convertToMTLTexture(sampleBuffer: sampleBuffer)
-            delegate?.videoCapture?(self, didCaptureVideoTexture: texture, timestamp: timestamp)
+    public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
+        // Because lowering the capture device's FPS looks ugly in the preview,
+        // we capture at full speed but only call the delegate at its desired
+        // framerate. If `fps` is -1, we run at the full framerate.
+        let timestamp = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
+        let deltaTime = timestamp - lastTimestamp
+        if fps == -1 || deltaTime >= CMTimeMake(1, Int32(fps)) {
+            lastTimestamp = timestamp
+            self.delegate?.videoCapture?(self, didCaptureSampleBuffer: sampleBuffer, timestamp: timestamp)
+            if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCaptureVideoTexture:timestamp:))) ?? false{
+                let texture = convertToMTLTexture(sampleBuffer: sampleBuffer)
+                delegate?.videoCapture?(self, didCaptureVideoTexture: texture, timestamp: timestamp)
+            }
         }
     }
-  }
-
-  public func captureOutput(_ output: AVCaptureOutput, didDrop sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
-    print("dropped frame")
-  }
+    
+    public func captureOutput(_ output: AVCaptureOutput, didDrop sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
+        print("dropped frame")
+    }
 }
 
 @available(iOS 10.0, *)
 extension VideoCapture: AVCapturePhotoCaptureDelegate {
-  public func photoOutput(_ captureOutput: AVCapturePhotoOutput,
-                          didFinishProcessingPhoto photoSampleBuffer: CMSampleBuffer?,
-                          previewPhoto previewPhotoSampleBuffer: CMSampleBuffer?,
-                          resolvedSettings: AVCaptureResolvedPhotoSettings,
-                          bracketSettings: AVCaptureBracketedStillImageSettings?,
-                          error: Error?) {
-    var imageTexture: MTLTexture?
-    var previewImage: UIImage?
-    if error == nil {
-        if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhotoTexture:))) ?? false{
-            imageTexture = convertToMTLTexture(sampleBuffer: photoSampleBuffer)
-            self.delegate?.videoCapture?(self, didCapturePhotoTexture: imageTexture)
-        }
-        
-        if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhoto:))) ?? false{
-            previewImage = convertToUIImage(sampleBuffer: previewPhotoSampleBuffer)
-            self.delegate?.videoCapture?(self, didCapturePhoto: previewImage)
+    public func photoOutput(_ captureOutput: AVCapturePhotoOutput,
+                            didFinishProcessingPhoto photoSampleBuffer: CMSampleBuffer?,
+                            previewPhoto previewPhotoSampleBuffer: CMSampleBuffer?,
+                            resolvedSettings: AVCaptureResolvedPhotoSettings,
+                            bracketSettings: AVCaptureBracketedStillImageSettings?,
+                            error: Error?) {
+        var imageTexture: MTLTexture?
+        var previewImage: UIImage?
+        if error == nil {
+            if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhotoTexture:))) ?? false{
+                imageTexture = convertToMTLTexture(sampleBuffer: photoSampleBuffer)
+                self.delegate?.videoCapture?(self, didCapturePhotoTexture: imageTexture)
+            }
+            
+            if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhoto:))) ?? false{
+                previewImage = convertToUIImage(sampleBuffer: previewPhotoSampleBuffer)
+                self.delegate?.videoCapture?(self, didCapturePhoto: previewImage)
+            }
         }
     }
-  }
 }
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
index 612a986d85994a03f45b3f6641e1851784d35787..42d6c2b7abb57bcfd3734e70befcb942cf1f5dcd 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
@@ -19,265 +19,242 @@ import paddle_mobile
 import MetalPerformanceShaders
 
 class FileReader {
-  let file: UnsafeMutablePointer<FILE>
-  let fileSize: Int
-  init(paramPath: String) throws {
-    guard let tmpFile = fopen(paramPath, "rb") else {
-      throw PaddleMobileError.loaderError(message: "open param file error" + paramPath)
+    let file: UnsafeMutablePointer<FILE>
+    let fileSize: Int
+    init(paramPath: String) throws {
+        guard let tmpFile = fopen(paramPath, "rb") else {
+            throw PaddleMobileError.loaderError(message: "open param file error" + paramPath)
+        }
+        file = tmpFile
+        fseek(file, 0, SEEK_END)
+        fileSize = ftell(file)
+        guard fileSize > 0 else {
+            throw PaddleMobileError.loaderError(message: "param file size is too small")
+        }
+        rewind(file)
+    }
+    
+    func read<T>() -> UnsafeMutablePointer<T> {
+        let ptr = UnsafeMutablePointer<T>.allocate(capacity: MemoryLayout<T>.size * fileSize)
+        fread(ptr, fileSize, 1, file)
+        return ptr
     }
-    file = tmpFile
-    fseek(file, 0, SEEK_END)
-    fileSize = ftell(file)
-    guard fileSize > 0 else {
-      throw PaddleMobileError.loaderError(message: "param file size is too small")
+    
+    deinit {
+        fclose(file)
     }
-    rewind(file)
-  }
-  
-  func read<T>() -> UnsafeMutablePointer<T> {
-    let ptr = UnsafeMutablePointer<T>.allocate(capacity: MemoryLayout<T>.size * fileSize)
-    fread(ptr, fileSize, 1, file)
-    return ptr
-  }
-  
-  deinit {
-    fclose(file)
-  }
 }
 
 enum Platform {
-  case GPU
+    case GPU
 }
 
 let platformSupport: [(Platform, String)] = [(.GPU, "GPU")]
 
 enum SupportModel: String{
-  case yolo               = "yolo"
-  case mobilenet_combined = "mobilenet_combined"
-  case super_resolution   = "superresoltion"
-  case mobilenet          = "mobilenet"
-  
-  static func supportedModels() -> [SupportModel] {
-    return [.super_resolution, .yolo, .mobilenet_combined, .mobilenet]
-  }
+    case yolo               = "yolo"
+    case mobilenet_combined = "mobilenet_combined"
+    case super_resolution   = "superresoltion"
+    case mobilenet          = "mobilenet"
+    
+    static func supportedModels() -> [SupportModel] {
+        return [.super_resolution, .yolo, .mobilenet_combined, .mobilenet]
+    }
 }
 
 let netSupport: [SupportModel : Net] = [
-  .super_resolution : SuperResolutionNet.init(device: MetalHelper.shared.device),
-  .yolo : YoloNet.init(device: MetalHelper.shared.device),
-  .mobilenet_combined : MobileNetCombined.init(device: MetalHelper.shared.device),
-  .mobilenet : MobileNet.init(device: MetalHelper.shared.device)]
+    .super_resolution : SuperResolutionNet.init(device: MetalHelper.shared.device),
+    .yolo : YoloNet.init(device: MetalHelper.shared.device),
+    .mobilenet_combined : MobileNetCombined.init(device: MetalHelper.shared.device),
+    .mobilenet : MobileNet.init(device: MetalHelper.shared.device)]
 
 class ViewController: UIViewController {
-  @IBOutlet weak var resultTextView: UITextView!
-  @IBOutlet weak var selectImageView: UIImageView!
-  @IBOutlet weak var elapsedTimeLabel: UILabel!
-  @IBOutlet weak var modelPickerView: UIPickerView!
-  @IBOutlet weak var threadPickerView: UIPickerView!
-  @IBOutlet weak var videoView: UIView!
-  //  var videoCapture: VideoCapture!
-  
-  var selectImage: UIImage?
-  var inputPointer: UnsafeMutablePointer<Float32>?
-  var modelType: SupportModel = SupportModel.supportedModels()[0]
-  var toPredictTexture: MTLTexture?
-  
-  var runner: Runner!
-  var platform: Platform = .GPU
-  var threadNum = 1
-  
-  @IBAction func loadAct(_ sender: Any) {
-    runner = Runner.init(inNet: netSupport[modelType]!, commandQueue: MetalHelper.shared.queue)
-    if platform == .GPU {
-//      let filePath = Bundle.main.path(forResource: "mingren_input_data", ofType: nil)
-//      let fileReader = try! FileReader.init(paramPath: filePath!)
-//      let pointer: UnsafeMutablePointer<Float32> = fileReader.read()
-//      
-//      
-//      let buffer = MetalHelper.shared.device.makeBuffer(length: fileReader.fileSize, options: .storageModeShared)
-//      
-//      buffer?.contents().copyMemory(from: pointer, byteCount: fileReader.fileSize)
-      
-      
-      if self.toPredictTexture == nil {
-        
-//        runner.getTexture(inBuffer: buffer!) { [weak self] (texture) in
-//          self?.toPredictTexture = texture
-//        }
+    @IBOutlet weak var resultTextView: UITextView!
+    @IBOutlet weak var selectImageView: UIImageView!
+    @IBOutlet weak var elapsedTimeLabel: UILabel!
+    @IBOutlet weak var modelPickerView: UIPickerView!
+    @IBOutlet weak var threadPickerView: UIPickerView!
+    @IBOutlet weak var videoView: UIView!
+    //  var videoCapture: VideoCapture!
+    
+    var selectImage: UIImage?
+    var inputPointer: UnsafeMutablePointer<Float32>?
+    var modelType: SupportModel = SupportModel.supportedModels()[0]
+    var toPredictTexture: MTLTexture?
+    
+    var runner: Runner!
+    var platform: Platform = .GPU
+    var threadNum = 1
+    
+    @IBAction func loadAct(_ sender: Any) {
+        runner = Runner.init(inNet: netSupport[modelType]!, commandQueue: MetalHelper.shared.queue)
+        if platform == .GPU {
+            //      let filePath = Bundle.main.path(forResource: "mingren_input_data", ofType: nil)
+            //      let fileReader = try! FileReader.init(paramPath: filePath!)
+            //      let pointer: UnsafeMutablePointer<Float32> = fileReader.read()
+            //      
+            //      
+            //      let buffer = MetalHelper.shared.device.makeBuffer(length: fileReader.fileSize, options: .storageModeShared)
+            //      
+            //      buffer?.contents().copyMemory(from: pointer, byteCount: fileReader.fileSize)
+            
+            
+            if self.toPredictTexture == nil {
+                
+                //        runner.getTexture(inBuffer: buffer!) { [weak self] (texture) in
+                //          self?.toPredictTexture = texture
+                //        }
+                
+                runner.getTexture(image: selectImage!.cgImage!) { [weak self] (texture) in
+                    self?.toPredictTexture = texture
+                }
+            }
+        } else {
+            fatalError( " unsupport " )
+        }
         
-        runner.getTexture(image: selectImage!.cgImage!) { [weak self] (texture) in
-          self?.toPredictTexture = texture
+        if runner.load() {
+            print(" load success ! ")
+        } else {
+            print(" load error ! ")
         }
-      }
-    } else {
-      fatalError( " unsupport " )
     }
     
-    if runner.load() {
-      print(" load success ! ")
-    } else {
-      print(" load error ! ")
+    @IBAction func selectImageAct(_ sender: Any) {
+        let imagePicker = UIImagePickerController()
+        imagePicker.sourceType = .camera
+        imagePicker.delegate = self
+        self.present(imagePicker, animated: true, completion: nil)
     }
-  }
-  
-  @IBAction func selectImageAct(_ sender: Any) {
-    let imagePicker = UIImagePickerController()
-    imagePicker.sourceType = .camera
-    imagePicker.delegate = self
-    self.present(imagePicker, animated: true, completion: nil)
-  }
-  
-  @IBAction func clearAct(_ sender: Any) {
-    runner.clear()
-  }
-  
-  @IBAction func predictAct(_ sender: Any) {
-    let max = 1
-    switch platform {
-    case .GPU:
-      guard let inTexture = toPredictTexture else {
-        resultTextView.text = "请选择图片 ! "
-        return
-      }
-      
-      let startDate = Date.init()
-      for i in 0..<max {
-        self.runner.predict(texture: inTexture) { [weak self] (success, resultHolder)  in
-          guard let sSelf = self else {
-            fatalError()
-          }
-          
-          if success, let inResultHolderArr = resultHolder {
-            let inResultHolder = inResultHolderArr[0]
-            if i == max - 1 {
-              let time = Date.init().timeIntervalSince(startDate)
+    
+    @IBAction func clearAct(_ sender: Any) {
+        runner.clear()
+    }
+    
+    @IBAction func predictAct(_ sender: Any) {
+        let max = 1
+        switch platform {
+        case .GPU:
+            guard let inTexture = toPredictTexture else {
+                resultTextView.text = "请选择图片 ! "
+                return
+            }
             
-              print(inResultHolder.result.floatArr(count: inResultHolder.capacity).strideArray())
-              DispatchQueue.main.async {
-                sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: resultHolder!)
-                sSelf.elapsedTimeLabel.text = "平均耗时: \(time/Double(max) * 1000.0) ms"
-              }
+            let startDate = Date.init()
+            for i in 0..<max {
+                self.runner.predict(texture: inTexture) { [weak self] (success, resultHolder)  in
+                    guard let sSelf = self else {
+                        fatalError()
+                    }
+                    
+                    if success, let inResultHolderArr = resultHolder {
+                        let inResultHolder = inResultHolderArr[0]
+                        if i == max - 1 {
+                            let time = Date.init().timeIntervalSince(startDate)
+                            
+                            print(inResultHolder.result.floatArr(count: inResultHolder.capacity).strideArray())
+                            DispatchQueue.main.async {
+                                sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: resultHolder!)
+                                sSelf.elapsedTimeLabel.text = "平均耗时: \(time/Double(max) * 1000.0) ms"
+                            }
+                        }
+                    }
+                    
+                    DispatchQueue.main.async {
+                        resultHolder?.first?.releasePointer()
+                    }
+                }
             }
-          }
-          
-          DispatchQueue.main.async {
-            resultHolder?.first?.releasePointer()
-          }
         }
-      }
     }
-  }
-  
-  override func viewDidLoad() {
-    super.viewDidLoad()
     
-    modelPickerView.delegate = self
-    modelPickerView.dataSource = self
-    threadPickerView.delegate = self
-    threadPickerView.dataSource = self
-    if let image = UIImage.init(named: "classify-img-output.png") {
-      selectImage = image
-      selectImageView.image = image
-    } else {
-      print("请添加测试图片")
+    override func viewDidLoad() {
+        super.viewDidLoad()
+        
+        GlobalConfig.shared.computePrecision = .Float16
+        GlobalConfig.shared.debug = false
+        
+        modelPickerView.delegate = self
+        modelPickerView.dataSource = self
+        threadPickerView.delegate = self
+        threadPickerView.dataSource = self
+        if let image = UIImage.init(named: "00001.jpg") {
+            selectImage = image
+            selectImageView.image = image
+        } else {
+            print("请添加测试图片")
+        }
     }
-    
-    GlobalConfig.shared.computePrecision = .Float32
-    
-    //    if platform == .CPU {
-    //      inputPointer = runner.preproccess(image: selectImage!.cgImage!)
-    //    } else if platform == .GPU {
-    //      runner.getTexture(image: selectImage!.cgImage!) {[weak self] (texture) in
-    //        self?.toPredictTexture = texture
-    //      }
-    //    } else {
-    //      fatalError( " unsupport " )
-    //    }
-    
-    //    videoCapture = VideoCapture.init(device: MetalHelper.shared.device, orientation: .portrait, position: .back)
-    //    videoCapture.fps = 30
-    //    videoCapture.delegate = self
-    //    videoCapture.setUp { (success) in
-    //      DispatchQueue.main.async {
-    //        if let preViewLayer = self.videoCapture.previewLayer {
-    //          self.videoView.layer.addSublayer(preViewLayer)
-    //          self.videoCapture.previewLayer?.frame = self.videoView.bounds
-    //        }
-    //        self.videoCapture.start()
-    //      }
-    //    }
-    
-  }
 }
 
 extension ViewController: UIPickerViewDataSource, UIPickerViewDelegate{
-  func numberOfComponents(in pickerView: UIPickerView) -> Int {
-    if pickerView == modelPickerView {
-      return 1
-    } else if pickerView == threadPickerView {
-      return 1
-    } else {
-      fatalError()
+    func numberOfComponents(in pickerView: UIPickerView) -> Int {
+        if pickerView == modelPickerView {
+            return 1
+        } else if pickerView == threadPickerView {
+            return 1
+        } else {
+            fatalError()
+        }
     }
-  }
-  
-  func pickerView(_ pickerView: UIPickerView, numberOfRowsInComponent component: Int) -> Int {
-    if pickerView == modelPickerView {
-      return SupportModel.supportedModels().count
-    } else if pickerView == threadPickerView {
-      return platformSupport.count
-    } else {
-      fatalError()
+    
+    func pickerView(_ pickerView: UIPickerView, numberOfRowsInComponent component: Int) -> Int {
+        if pickerView == modelPickerView {
+            return SupportModel.supportedModels().count
+        } else if pickerView == threadPickerView {
+            return platformSupport.count
+        } else {
+            fatalError()
+        }
     }
-  }
-  
-  public func pickerView(_ pickerView: UIPickerView, titleForRow row: Int, forComponent component: Int) -> String? {
-    if pickerView == modelPickerView {
-      return SupportModel.supportedModels()[row].rawValue
-    } else if pickerView == threadPickerView {
-      return platformSupport[row].1
-    } else {
-      fatalError()
+    
+    public func pickerView(_ pickerView: UIPickerView, titleForRow row: Int, forComponent component: Int) -> String? {
+        if pickerView == modelPickerView {
+            return SupportModel.supportedModels()[row].rawValue
+        } else if pickerView == threadPickerView {
+            return platformSupport[row].1
+        } else {
+            fatalError()
+        }
     }
-  }
-  
-  public func pickerView(_ pickerView: UIPickerView, didSelectRow row: Int, inComponent component: Int) {
-    if pickerView == modelPickerView {
-      self.modelType = SupportModel.supportedModels()[row]
-    } else if pickerView == threadPickerView {
-      platform = platformSupport[row].0
-    } else {
-      fatalError()
+    
+    public func pickerView(_ pickerView: UIPickerView, didSelectRow row: Int, inComponent component: Int) {
+        if pickerView == modelPickerView {
+            self.modelType = SupportModel.supportedModels()[row]
+        } else if pickerView == threadPickerView {
+            platform = platformSupport[row].0
+        } else {
+            fatalError()
+        }
     }
-  }
 }
 
 extension ViewController:  UIImagePickerControllerDelegate, UINavigationControllerDelegate {
-  func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) {
-    picker.dismiss(animated: true){[weak self] in
-      guard let sSelf = self, let image =  info["UIImagePickerControllerOriginalImage"] as? UIImage else{
-        fatalError("no image")
-      }
-      sSelf.selectImage = image
-      sSelf.selectImageView.image = image
-      sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in
-        sSelf.toPredictTexture = texture
-      })
+    func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) {
+        picker.dismiss(animated: true){[weak self] in
+            guard let sSelf = self, let image =  info["UIImagePickerControllerOriginalImage"] as? UIImage else{
+                fatalError("no image")
+            }
+            sSelf.selectImage = image
+            sSelf.selectImageView.image = image
+            sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in
+                sSelf.toPredictTexture = texture
+            })
+        }
     }
-  }
 }
 
 var bool1 = false
 extension ViewController: VideoCaptureDelegate{
-  func predictTexture(texture: MTLTexture){
-    runner.scaleTexture(input: texture) { (scaledTexture) in
-      self.runner.predict(texture: scaledTexture, completion: { (success, resultHolder) in
-        //        print(resultHolder!.result![0])
-        resultHolder?.first?.releasePointer()
-      })
+    func predictTexture(texture: MTLTexture){
+        runner.scaleTexture(input: texture) { (scaledTexture) in
+            self.runner.predict(texture: scaledTexture, completion: { (success, resultHolder) in
+                //        print(resultHolder!.result![0])
+                resultHolder?.first?.releasePointer()
+            })
+        }
     }
-  }
-
+    
 }
 
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj b/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj
index 5b7b65da7c8b838eebcfb24873d5f073303b8b1e..007fd5e42982539dd9872cce4f2cead5727fde8d 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj
@@ -326,9 +326,10 @@
 			isa = XCBuildConfiguration;
 			buildSettings = {
 				ALWAYS_SEARCH_USER_PATHS = NO;
-				IPHONEOS_DEPLOYMENT_TARGET = 12.1;
+				IPHONEOS_DEPLOYMENT_TARGET = 9.0;
 				MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
 				MTL_FAST_MATH = YES;
+				MTL_LANGUAGE_REVISION = Metal12;
 				SDKROOT = iphoneos;
 			};
 			name = Debug;
@@ -337,9 +338,10 @@
 			isa = XCBuildConfiguration;
 			buildSettings = {
 				ALWAYS_SEARCH_USER_PATHS = NO;
-				IPHONEOS_DEPLOYMENT_TARGET = 12.1;
+				IPHONEOS_DEPLOYMENT_TARGET = 9.0;
 				MTL_ENABLE_DEBUG_INFO = NO;
 				MTL_FAST_MATH = YES;
+				MTL_LANGUAGE_REVISION = Metal12;
 				SDKROOT = iphoneos;
 			};
 			name = Release;
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormKernel.metal
index 96333a07a9669ecb2b5bfe901d71be729e37b533..ab1dcfae6813ddef860158bc9fd638d26dfb4f8a 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormKernel.metal
@@ -20,23 +20,23 @@ kernel void batchnorm(texture2d_array<float, access::read> inTexture [[texture(0
                       const device float4 * nscale [[buffer(0)]],
                       const device float4 * nbias [[buffer(1)]],
                       uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  const float4 input = inTexture.read(gid.xy, gid.z);
-  float4 output = input * nscale[gid.z] + nbias[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return; // threads outside the output texture do nothing
+    const float4 input = inTexture.read(gid.xy, gid.z);
+    float4 output = input * nscale[gid.z] + nbias[gid.z]; // component-wise scale and bias, looked up per array slice
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void batchnorm_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
-                      texture2d_array<half, access::write> outTexture [[texture(1)]],
-                      const device half4 * newScale [[buffer(0)]],
-                      const device half4 * newBias [[buffer(1)]],
-                      uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  const half4 input = inTexture.read(gid.xy, gid.z);
-  half4 output = input * newScale[gid.z] + newBias[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
+                           texture2d_array<half, access::write> outTexture [[texture(1)]],
+                           const device half4 * newScale [[buffer(0)]],
+                           const device half4 * newBias [[buffer(1)]],
+                           uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return; // threads outside the output texture do nothing
+    const half4 input = inTexture.read(gid.xy, gid.z);
+    half4 output = input * newScale[gid.z] + newBias[gid.z]; // half-precision variant: per-slice scale and bias
+    outTexture.write(output, gid.xy, gid.z);
 }
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal
index eb94408c8ac664be5cf62bc28bfb02825856ebd4..98ba10d8472086e85ddf62349d56a85c910dd312 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal
@@ -15,10 +15,10 @@ struct MetalConvParam {
 };
 
 kernel void batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
-                                         const device float4 *new_scale [[buffer(0)]],
-                                         const device float4 *new_biase [[buffer(1)]],
-                                         uint3 gid [[thread_position_in_grid]]) {
+                                texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                const device float4 *new_scale [[buffer(0)]],
+                                const device float4 *new_biase [[buffer(1)]],
+                                uint3 gid [[thread_position_in_grid]]) {
     
     if (gid.x >= outTexture.get_width() ||
         gid.y >= outTexture.get_height() ||
@@ -32,5 +32,5 @@ kernel void batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture
     input = inTexture.sample(sample, gid.x, gid.y, gid.z);
     output = fmax(input * new_scale[gid.z] + new_biase[gid.z], 0.0);
     outTexture.write(output, gid.xy, gid.z);
-
+    
 }
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.inc.metal
index a590f8089890f2fab1af4c1f736f3bfc5708aecf..188c31019d98ae396bf8dcc605402529164e1dbe 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.inc.metal
@@ -21,29 +21,29 @@
 #define VECTOR(p, n) CONCAT2(p, n)
 
 kernel void FUNC(bilinear_interp, P)(texture2d_array<P, access::read> input [[texture(0)]],
-                     texture2d_array<P, access::write> output [[texture(1)]],
-                     constant bilinear_interp_param & pm [[buffer(0)]],
-                     uint3 gid [[thread_position_in_grid]]) {
-  VECTOR(P, 4) r;
-  if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
-    r = input.read(gid.xy, gid.z);
-  } else {
-    P w = gid.x * pm.ratio_w;
-    P h = gid.y * pm.ratio_h;
-    uint w0 = w, h0 = h;
-    uint w1 = w0 + 1, h1 = h0 + 1;
-    P w1lambda = w - w0, h1lambda = h - h0;
-    P w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
-    if (w1 >= input.get_width()) w1 = w0;
-    if (h1 >= input.get_height()) h1 = h0;
-    VECTOR(P, 4) r0 = input.read(uint2(w0, h0), gid.z);
-    VECTOR(P, 4) r1 = input.read(uint2(w1, h0), gid.z);
-    VECTOR(P, 4) r2 = input.read(uint2(w0, h1), gid.z);
-    VECTOR(P, 4) r3 = input.read(uint2(w1, h1), gid.z);
-    r = h2lambda * (w2lambda * r0 + w1lambda * r1)
-      + h1lambda * (w2lambda * r2 + w1lambda * r3);
-  }
-  output.write(r, gid.xy, gid.z);
+                                     texture2d_array<P, access::write> output [[texture(1)]],
+                                     constant bilinear_interp_param & pm [[buffer(0)]],
+                                     uint3 gid [[thread_position_in_grid]]) {
+    VECTOR(P, 4) r;
+    if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
+        r = input.read(gid.xy, gid.z); // same size: plain copy, no interpolation
+    } else {
+        P w = gid.x * pm.ratio_w; // position of this output pixel in input coordinates
+        P h = gid.y * pm.ratio_h;
+        uint w0 = w, h0 = h; // truncation picks the top-left neighbour
+        uint w1 = w0 + 1, h1 = h0 + 1;
+        P w1lambda = w - w0, h1lambda = h - h0; // fractional parts weight the right/bottom neighbours
+        P w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
+        if (w1 >= input.get_width()) w1 = w0; // clamp at the right edge
+        if (h1 >= input.get_height()) h1 = h0; // clamp at the bottom edge
+        VECTOR(P, 4) r0 = input.read(uint2(w0, h0), gid.z);
+        VECTOR(P, 4) r1 = input.read(uint2(w1, h0), gid.z);
+        VECTOR(P, 4) r2 = input.read(uint2(w0, h1), gid.z);
+        VECTOR(P, 4) r3 = input.read(uint2(w1, h1), gid.z);
+        r = h2lambda * (w2lambda * r0 + w1lambda * r1)
+            + h1lambda * (w2lambda * r2 + w1lambda * r3);
+    }
+    output.write(r, gid.xy, gid.z); // NOTE(review): gid is not bounds-checked before this write — confirm the dispatch matches the output size
 }
 
 #endif
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.metal
index 394cf89db09d47b0d3c87ff124c21a93962c0972..6104abb01d459a7e258e4104f17bba9b4e23424c 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.metal
@@ -16,8 +16,8 @@
 using namespace metal;
 
 struct bilinear_interp_param {
-  float ratio_h;
-  float ratio_w;
+    float ratio_h; // multiplied with the output y coordinate to obtain the input y position
+    float ratio_w; // multiplied with the output x coordinate to obtain the input x position
 };
 
 #define P float
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.inc.metal
index 918fbac1a713d7b0442a1eb1f07abea3616bec96..184ee2bb71189fa5e89e3d0c18901ea2b70e8d8e 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.inc.metal
@@ -20,35 +20,35 @@
 #define FUNC(f, p) CONCAT2_(f, p)
 #define VECTOR(p, n) CONCAT2(p, n)
 kernel void FUNC(boxcoder, P)(texture2d_array<P, access::read> priorBox [[texture(0)]],
-                     texture2d_array<P, access::read> priorBoxVar [[texture(1)]],
-                     texture2d_array<P, access::read> targetBox [[texture(2)]],
-                     texture2d_array<P, access::write> output[[texture(3)]],
-                     uint3 gid [[thread_position_in_grid]]) {
-  VECTOR(P, 4) p = priorBox.read(uint2(0, gid.x), gid.z);
-  VECTOR(P, 4) pv = priorBoxVar.read(uint2(0, gid.x), gid.z);
-  VECTOR(P, 4) t;
-  t[0] = targetBox.read(uint2(0, gid.x), gid.z)[0];
-  t[1] = targetBox.read(uint2(1, gid.x), gid.z)[0];
-  t[2] = targetBox.read(uint2(2, gid.x), gid.z)[0];
-  t[3] = targetBox.read(uint2(3, gid.x), gid.z)[0];
-  
-  P px = (p.x + p.z) / 2;
-  P py = (p.y + p.w) / 2;
-  P pw = p.z - p.x;
-  P ph = p.w - p.y;
-  
-  P tx = pv.x * t.x * pw + px;
-  P ty = pv.y * t.y * ph + py;
-  P tw = exp(pv.z * t.z) * pw;
-  P th = exp(pv.w * t.w) * ph;
-  
-  VECTOR(P, 4) r;
-  r.x = tx - tw / 2;
-  r.y = ty - th / 2;
-  r.z = tx + tw / 2;
-  r.w = ty + th / 2;
-
-  output.write(r, gid.xy, gid.z);
+                              texture2d_array<P, access::read> priorBoxVar [[texture(1)]],
+                              texture2d_array<P, access::read> targetBox [[texture(2)]],
+                              texture2d_array<P, access::write> output[[texture(3)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+    VECTOR(P, 4) p = priorBox.read(uint2(0, gid.x), gid.z); // prior box; used below as two corners (x, y) and (z, w)
+    VECTOR(P, 4) pv = priorBoxVar.read(uint2(0, gid.x), gid.z); // per-coordinate factors that scale the target values
+    VECTOR(P, 4) t;
+    t[0] = targetBox.read(uint2(0, gid.x), gid.z)[0]; // the four target values sit in the first component of columns 0..3
+    t[1] = targetBox.read(uint2(1, gid.x), gid.z)[0];
+    t[2] = targetBox.read(uint2(2, gid.x), gid.z)[0];
+    t[3] = targetBox.read(uint2(3, gid.x), gid.z)[0];
+    
+    P px = (p.x + p.z) / 2; // prior box centre
+    P py = (p.y + p.w) / 2;
+    P pw = p.z - p.x; // prior box width and height
+    P ph = p.w - p.y;
+    
+    P tx = pv.x * t.x * pw + px; // decoded centre: scaled target times prior size, offset from the prior centre
+    P ty = pv.y * t.y * ph + py;
+    P tw = exp(pv.z * t.z) * pw; // decoded size: exponential of the scaled target times prior size
+    P th = exp(pv.w * t.w) * ph;
+    
+    VECTOR(P, 4) r;
+    r.x = tx - tw / 2; // convert centre/size back to corner form
+    r.y = ty - th / 2;
+    r.z = tx + tw / 2;
+    r.w = ty + th / 2;
+    
+    output.write(r, gid.xy, gid.z); // NOTE(review): inputs are read at row gid.x but the result is written at gid.xy — confirm intended layout
 }
 
 #endif
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BufferToTexture.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BufferToTexture.metal
index 3c07872616bb7c2f130d92247feeeeaa60ece21e..12450f574159cb7030c8e902cc3535d1dda1b864 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BufferToTexture.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BufferToTexture.metal
@@ -13,24 +13,24 @@ kernel void buffer_to_texture_kernel(
                                      const device float *input [[buffer(0)]],
                                      texture2d<float, access::write> outTexture [[texture(0)]],
                                      uint2 gid [[thread_position_in_grid]]){
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) {
-    return;
-  }
-  
-  float y = input[outTexture.get_width() * gid.y + gid.x];
-  outTexture.write(float4(y, 0.0f, 0.0f, 0.0f), gid);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return; // thread lies outside the output texture
+    }
+    
+    float y = input[outTexture.get_width() * gid.y + gid.x]; // buffer is indexed row by row, using the texture width as row length
+    outTexture.write(float4(y, 0.0f, 0.0f, 0.0f), gid); // value goes into the first component; the other three are zero
 }
 
 kernel void buffer_to_texture_kernel_half(const device float *input [[buffer(0)]],
                                           texture2d<half, access::write> outTexture [[texture(0)]],
                                           uint2 gid [[thread_position_in_grid]]){
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) {
-    return;
-  }
-  
-  float y = input[outTexture.get_width() * gid.y + gid.x];
-  outTexture.write(half4(y, 0.0f, 0.0f, 0.0f), gid);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return; // thread lies outside the output texture
+    }
+    
+    float y = input[outTexture.get_width() * gid.y + gid.x]; // input stays float; indexed row by row with the texture width
+    outTexture.write(half4(y, 0.0f, 0.0f, 0.0f), gid); // converted to half on write; only the first component carries data
 }
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
index 40bae035c097b5ab386d78520b6b04f074eb2fee..099b8ca77cb10a81ffd8e2e026d1058c0954bd97 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
@@ -17,104 +17,104 @@ using namespace metal;
 
 
 inline void xyzn2abcd_1(int xyzn[4], int abcd[4]) {
-  abcd[0] = abcd[1] = abcd[2] = 0;
-  abcd[3] = xyzn[0] * 4 + xyzn[3];
+    abcd[0] = abcd[1] = abcd[2] = 0; // 1-index variant: only abcd[3] carries information
+    abcd[3] = xyzn[0] * 4 + xyzn[3]; // four consecutive indices per xyzn[0] step, selected by xyzn[3]
 }
 inline void xyzn2abcd_2(int xyzn[4], int abcd[4]) {
-  abcd[0] = abcd[1] = 0;
-  abcd[2] = xyzn[1];
-  abcd[3] = xyzn[0] * 4 + xyzn[3];
+    abcd[0] = abcd[1] = 0; // 2-index variant: the two leading indices are unused
+    abcd[2] = xyzn[1]; // copied straight from xyzn[1]
+    abcd[3] = xyzn[0] * 4 + xyzn[3]; // four consecutive indices per xyzn[0] step, selected by xyzn[3]
 }
 inline void xyzn2abcd_3(int xyzn[4], int abcd[4]) {
-  abcd[0] = 0;
-  abcd[3] = xyzn[0];
-  abcd[2] = xyzn[1];
-  abcd[1] = xyzn[2] * 4 + xyzn[3];
+    abcd[0] = 0; // 3-index variant: the leading index is unused
+    abcd[3] = xyzn[0];
+    abcd[2] = xyzn[1];
+    abcd[1] = xyzn[2] * 4 + xyzn[3]; // four consecutive indices per xyzn[2] step, selected by xyzn[3]
 }
 inline void xyzn2abcd_4(int C, int xyzn[4], int abcd[4]) {
-  abcd[2] = xyzn[0];
-  abcd[1] = xyzn[1];
-  uint t = xyzn[2] * 4 + xyzn[3];
-  abcd[0] = t / C;
-  abcd[3] = t % C;
+    abcd[2] = xyzn[0];
+    abcd[1] = xyzn[1];
+    uint t = xyzn[2] * 4 + xyzn[3]; // combined index built from xyzn[2] and the 0..3 selector xyzn[3]
+    abcd[0] = t / C; // split the combined index by C: quotient goes to abcd[0] ...
+    abcd[3] = t % C; // ... remainder to abcd[3] (t is uint, so C is converted to unsigned here)
 }
 
 inline void abcd2xyzn_1(int abcd[4], int xyzn[4]) {
-  xyzn[1] = xyzn[2] = 0;
-  xyzn[0] = abcd[3] / 4;
-  xyzn[1] = abcd[3] % 4;
+    xyzn[1] = xyzn[2] = 0; // 1-index variant: xyzn[1] and xyzn[2] are always zero
+    xyzn[0] = abcd[3] / 4; // inverse of xyzn2abcd_1, where abcd[3] == xyzn[0] * 4 + xyzn[3]
+    xyzn[3] = abcd[3] % 4; // remainder belongs in xyzn[3] (as in abcd2xyzn_2), not xyzn[1]
 }
 inline void abcd2xyzn_2(int abcd[4], int xyzn[4]) {
-  xyzn[2] = 0;
-  xyzn[1] = abcd[2];
-  xyzn[0] = abcd[3] / 4;
-  xyzn[3] = abcd[3] % 4;
+    xyzn[2] = 0; // 2-index variant: xyzn[2] is unused
+    xyzn[1] = abcd[2];
+    xyzn[0] = abcd[3] / 4; // undo the packing abcd[3] == xyzn[0] * 4 + xyzn[3]
+    xyzn[3] = abcd[3] % 4;
 }
 inline void abcd2xyzn_3(int abcd[4], int xyzn[4]) {
-  xyzn[0] = abcd[3];
-  xyzn[1] = abcd[2];
-  xyzn[2] = abcd[1] / 4;
-  xyzn[3] = abcd[1] % 4;
+    xyzn[0] = abcd[3];
+    xyzn[1] = abcd[2];
+    xyzn[2] = abcd[1] / 4; // undo the packing abcd[1] == xyzn[2] * 4 + xyzn[3]
+    xyzn[3] = abcd[1] % 4;
 }
 inline void abcd2xyzn_4(int C, int abcd[4], int xyzn[4]) {
-  xyzn[0] = abcd[2];
-  xyzn[1] = abcd[1];
-  uint t = abcd[0] * C + abcd[3];
-  xyzn[2] = t / 4;
-  xyzn[3] = t % 4;
+    xyzn[0] = abcd[2];
+    xyzn[1] = abcd[1];
+    uint t = abcd[0] * C + abcd[3]; // recombine abcd[0] and abcd[3] into one index, C entries per abcd[0] step
+    xyzn[2] = t / 4; // then split that index into groups of four
+    xyzn[3] = t % 4;
 }
 
 inline void xyzn2abcd(int C, int xyzn[4], int abcd[4]) {
-  abcd[2] = xyzn[0];
-  abcd[1] = xyzn[1];
-  uint t = xyzn[2] * 4 + xyzn[3];
-  abcd[0] = t / C;
-  abcd[3] = t % C;
+    abcd[2] = xyzn[0];
+    abcd[1] = xyzn[1];
+    uint t = xyzn[2] * 4 + xyzn[3]; // same statements as xyzn2abcd_4
+    abcd[0] = t / C;
+    abcd[3] = t % C;
 }
 
 inline void abcd2xyzn(int C, int abcd[4], int xyzn[4]) {
-  xyzn[0] = abcd[2];
-  xyzn[1] = abcd[1];
-  uint t = abcd[0] * C + abcd[3];
-  xyzn[2] = t / 4;
-  xyzn[3] = t % 4;
+    xyzn[0] = abcd[2];
+    xyzn[1] = abcd[1];
+    uint t = abcd[0] * C + abcd[3]; // same statements as abcd2xyzn_4
+    xyzn[2] = t / 4;
+    xyzn[3] = t % 4;
 }
 
 inline int32_t abcd2index(int32_t dim[4], int32_t abcd[4]) {
-  int32_t r = abcd[0];
-  r = r * dim[1] + abcd[1];
-  r = r * dim[2] + abcd[2];
-  r = r * dim[3] + abcd[3];
-  return r;
+    int32_t r = abcd[0]; // flatten a 4-index position, last index varying fastest; dim[0] is never needed
+    r = r * dim[1] + abcd[1];
+    r = r * dim[2] + abcd[2];
+    r = r * dim[3] + abcd[3];
+    return r;
 }
 
 inline void index2abcd(int32_t dim[4], int32_t ind, int32_t abcd[4]) {
-  abcd[3] = ind % dim[3]; ind /= dim[3];
-  abcd[2] = ind % dim[2]; ind /= dim[2];
-  abcd[1] = ind % dim[1]; ind /= dim[1];
-  abcd[0] = ind;
+    abcd[3] = ind % dim[3]; ind /= dim[3]; // peel indices off the flat index, last dimension first
+    abcd[2] = ind % dim[2]; ind /= dim[2];
+    abcd[1] = ind % dim[1]; ind /= dim[1];
+    abcd[0] = ind; // whatever remains is the leading index
 }
 
 inline void trans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) {
-  for (int i = 0; i < 4; i++) {
-    opos[i] = ipos[trans[i]];
-  }
+    for (int i = 0; i < 4; i++) {
+        opos[i] = ipos[trans[i]]; // output slot i takes the input entry named by trans[i]
+    }
 }
 
 inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) {
-  for (int i = 0; i < 4; i++) {
-    opos[trans[i]] = ipos[i];
-  }
+    for (int i = 0; i < 4; i++) {
+        opos[trans[i]] = ipos[i]; // input entry i is stored into the output slot named by trans[i]
+    }
 }
 
 
 struct MetalConvParam {
-  short offsetX;
-  short offsetY;
-  short offsetZ;
-  ushort strideX;
-  ushort strideY;
-  ushort dilationX;
-  ushort dilationY;
+    short offsetX; // NOTE(review): field meanings are inferred from names only — confirm against the code that fills this struct
+    short offsetY;
+    short offsetZ;
+    ushort strideX;
+    ushort strideY;
+    ushort dilationX;
+    ushort dilationY;
 };
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.inc.metal
index 2b070fc48b78391e96b93823eeff7f936de2ff7d..ff8bd3d7a39dd89186b1d3683fbf59d9f89e4ae5 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.inc.metal
@@ -42,73 +42,73 @@
 //                                     uint3 gid [[thread_position_in_grid]]) {
 //}
 kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
-                                          texture2d_array<P, access::read> in1 [[texture(1)]],
+                                      texture2d_array<P, access::read> in1 [[texture(1)]],
 #if N >= 3
-                                          texture2d_array<P, access::read> in2 [[texture(2)]],
+                                      texture2d_array<P, access::read> in2 [[texture(2)]],
 #endif
 #if N >= 4
-                                          texture2d_array<P, access::read> in3 [[texture(3)]],
+                                      texture2d_array<P, access::read> in3 [[texture(3)]],
 #endif
 #if N >= 5
-                                          texture2d_array<P, access::read> in4 [[texture(4)]],
+                                      texture2d_array<P, access::read> in4 [[texture(4)]],
 #endif
 #if N >= 6
-                                          texture2d_array<P, access::read> in5 [[texture(5)]],
+                                      texture2d_array<P, access::read> in5 [[texture(5)]],
 #endif
-                                          texture2d_array<P, access::read> inx [[texture(N)]],
-                                          texture2d_array<P, access::write> out [[texture(N+1)]],
-                                          constant ConcatParam & pm [[buffer(0)]],
-                                          uint3 gid [[thread_position_in_grid]]) {
-
-   ConcatParam cp = pm;
-   int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4];
-   VECTOR(P, 4) r = inx.read(gid.xy, gid.z);
-   for (int i = 0; i < 4; i++) {
-     xyzn[3] = i;
+                                      texture2d_array<P, access::read> inx [[texture(N)]],
+                                      texture2d_array<P, access::write> out [[texture(N+1)]],
+                                      constant ConcatParam & pm [[buffer(0)]],
+                                      uint3 gid [[thread_position_in_grid]]) {
+    
+    ConcatParam cp = pm; // private copy: cp.odim is patched temporarily inside the loop
+    int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4];
+    VECTOR(P, 4) r = inx.read(gid.xy, gid.z); // start from inx; components not covered by an input keep this value
+    for (int i = 0; i < 4; i++) { // handle each of the four components of the output texel separately
+        xyzn[3] = i;
 #if R == 4
-     xyzn2abcd_4(cp.odim[3], xyzn, abcd);
+        xyzn2abcd_4(cp.odim[3], xyzn, abcd);
 #else
-     FUNC_R(xyzn2abcd, R)(xyzn, abcd);
+        FUNC_R(xyzn2abcd, R)(xyzn, abcd);
 #endif
-     int k = abcd[cp.axis] - cp.offset;
-     if (k < 0) continue;
-     int j = 0;
-     for (; j < N; j++) {
-       if (k < cp.vdim[j]) {
-         break;
-       }
-       k -= cp.vdim[j];
-     }
-     if (j == N) {
-       continue;
-     }
-     int ta = cp.odim[cp.axis];
-     abcd[cp.axis] = k;
-     cp.odim[cp.axis] = cp.vdim[j];
+        int k = abcd[cp.axis] - cp.offset; // position along the concat axis, relative to offset
+        if (k < 0) continue; // before the concatenated range: keep the inx value
+        int j = 0;
+        for (; j < N; j++) { // find the input j whose extent contains k; k becomes the position inside it
+            if (k < cp.vdim[j]) {
+                break;
+            }
+            k -= cp.vdim[j];
+        }
+        if (j == N) {
+            continue; // past the last input: keep the inx value
+        }
+        int ta = cp.odim[cp.axis]; // remember the output extent so it can be restored below
+        abcd[cp.axis] = k;
+        cp.odim[cp.axis] = cp.vdim[j]; // use input j's extent while converting back to texture coordinates
 #if R == 4
-     abcd2xyzn_4(cp.odim[3], abcd, oxyzn);
+        abcd2xyzn_4(cp.odim[3], abcd, oxyzn);
 #else
-     FUNC_R(abcd2xyzn, R)(abcd, oxyzn);
+        FUNC_R(abcd2xyzn, R)(abcd, oxyzn);
 #endif
-     cp.odim[cp.axis] = ta;
-     switch (j) {
-       case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
-       case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+        cp.odim[cp.axis] = ta;
+        switch (j) { // fetch the single component from the selected input texture
+            case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+            case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
 #if N >= 3
-       case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+            case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
 #endif
 #if N >= 4
-       case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+            case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
 #endif
 #if N >= 5
-       case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+            case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
 #endif
 #if N >= 6
-       case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+            case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
 #endif
-     }
-   }
-   out.write(r, gid.xy, gid.z);
+        }
+    }
+    out.write(r, gid.xy, gid.z);
 }
 
 #endif // V == NORMAL
@@ -117,66 +117,66 @@ kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[tex
 
 #if V == VX
 kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
-                                          texture2d_array<P, access::read> in1 [[texture(1)]],
+                                      texture2d_array<P, access::read> in1 [[texture(1)]],
 #if N >= 3
-                                          texture2d_array<P, access::read> in2 [[texture(2)]],
+                                      texture2d_array<P, access::read> in2 [[texture(2)]],
 #endif // N >= 3
 #if N >= 4
-                                          texture2d_array<P, access::read> in3 [[texture(3)]],
+                                      texture2d_array<P, access::read> in3 [[texture(3)]],
 #endif // N >= 4
 #if N >= 5
-                                          texture2d_array<P, access::read> in4 [[texture(4)]],
+                                      texture2d_array<P, access::read> in4 [[texture(4)]],
 #endif // N >= 5
 #if N >= 6
-                                          texture2d_array<P, access::read> in5 [[texture(5)]],
+                                      texture2d_array<P, access::read> in5 [[texture(5)]],
 #endif // N >= 6
-                                          texture2d_array<P, access::write> out [[texture(N)]],
-                                          constant ConcatParam & pm [[buffer(0)]],
-                                          uint3 gid [[thread_position_in_grid]]) {
-  int x = gid.x - pm.offset;
-  if (x < 0) return;
-  if (x < pm.vdim[0]) {
-    VECTOR(P, 4) r = in0.read(gid.xy, gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
-  x -= pm.vdim[0];
-  if (x < pm.vdim[1]) {
-    VECTOR(P, 4) r = in1.read(uint2(x, gid.y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+                                      texture2d_array<P, access::write> out [[texture(N)]],
+                                      constant ConcatParam & pm [[buffer(0)]],
+                                      uint3 gid [[thread_position_in_grid]]) {
+    int x = gid.x - pm.offset; // x position relative to the start of the concatenated range
+    if (x < 0) return; // before the range: nothing is written for this thread
+    if (x < pm.vdim[0]) {
+        VECTOR(P, 4) r = in0.read(gid.xy, gid.z); // NOTE(review): reads at gid.x, not x; identical only when pm.offset == 0 — confirm
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
+    x -= pm.vdim[0]; // step past input 0; x is now relative to input 1
+    if (x < pm.vdim[1]) {
+        VECTOR(P, 4) r = in1.read(uint2(x, gid.y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #if N >= 3
-  x -= pm.vdim[1];
-  if (x < pm.vdim[2]) {
-    VECTOR(P, 4) r = in2.read(uint2(x, gid.y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    x -= pm.vdim[1]; // same pattern for each further input that N enables
+    if (x < pm.vdim[2]) {
+        VECTOR(P, 4) r = in2.read(uint2(x, gid.y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 3
 #if N >= 4
-  x -= pm.vdim[2];
-  if (x < pm.vdim[3]) {
-    VECTOR(P, 4) r = in3.read(uint2(x, gid.y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    x -= pm.vdim[2];
+    if (x < pm.vdim[3]) {
+        VECTOR(P, 4) r = in3.read(uint2(x, gid.y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 4
 #if N >= 5
-  x -= pm.vdim[3];
-  if (x < pm.vdim[4]) {
-    VECTOR(P, 4) r = in4.read(uint2(x, gid.y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    x -= pm.vdim[3];
+    if (x < pm.vdim[4]) {
+        VECTOR(P, 4) r = in4.read(uint2(x, gid.y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 5
 #if N >= 6
-  x -= pm.vdim[4];
-  if (x < pm.vdim[5]) {
-    VECTOR(P, 4) r = in5.read(uint2(x, gid.y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    x -= pm.vdim[4];
+    if (x < pm.vdim[5]) {
+        VECTOR(P, 4) r = in5.read(uint2(x, gid.y), gid.z);
+        out.write(r, gid.xy, gid.z); // threads past the last input fall through and write nothing
+        return;
+    }
 #endif // N >= 6
 }
 #endif // V == VX
@@ -199,50 +199,50 @@ kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[tex
                                       texture2d_array<P, access::write> out [[texture(N)]],
                                       constant ConcatParam & pm [[buffer(0)]],
                                       uint3 gid [[thread_position_in_grid]]) {
-  int y = gid.y - pm.offset;
-  if (y < 0) return;
-  if (y < pm.vdim[0]) {
-    VECTOR(P, 4)  r = in0.read(gid.xy, gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
-  y -= pm.vdim[0];
-  if (y < pm.vdim[1]) {
-    VECTOR(P, 4)  r = in1.read(uint2(gid.x, y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    int y = gid.y - pm.offset;
+    if (y < 0) return;
+    if (y < pm.vdim[0]) {
+        VECTOR(P, 4)  r = in0.read(gid.xy, gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
+    y -= pm.vdim[0];
+    if (y < pm.vdim[1]) {
+        VECTOR(P, 4)  r = in1.read(uint2(gid.x, y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #if N >= 3
-  y -= pm.vdim[1];
-  if (y < pm.vdim[2]) {
-    VECTOR(P, 4)  r = in2.read(uint2(gid.x, y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    y -= pm.vdim[1];
+    if (y < pm.vdim[2]) {
+        VECTOR(P, 4)  r = in2.read(uint2(gid.x, y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 3
 #if N >= 4
-  y -= pm.vdim[2];
-  if (y < pm.vdim[3]) {
-    VECTOR(P, 4)  r = in3.read(uint2(gid.x, y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    y -= pm.vdim[2];
+    if (y < pm.vdim[3]) {
+        VECTOR(P, 4)  r = in3.read(uint2(gid.x, y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 4
 #if N >= 5
-  y -= pm.vdim[3];
-  if (y < pm.vdim[4]) {
-    VECTOR(P, 4)  r = in4.read(uint2(gid.x, y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    y -= pm.vdim[3];
+    if (y < pm.vdim[4]) {
+        VECTOR(P, 4)  r = in4.read(uint2(gid.x, y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 5
 #if N >= 6
-  y -= pm.vdim[4];
-  if (y < pm.vdim[5]) {
-    VECTOR(P, 4)  r = in5.read(uint2(gid.x, y), gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    y -= pm.vdim[4];
+    if (y < pm.vdim[5]) {
+        VECTOR(P, 4)  r = in5.read(uint2(gid.x, y), gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 6
 }
 #endif // V == VY
@@ -265,50 +265,50 @@ kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[tex
                                       texture2d_array<P, access::write> out [[texture(N)]],
                                       constant ConcatParam & pm [[buffer(0)]],
                                       uint3 gid [[thread_position_in_grid]]) {
-  int z = gid.z - pm.offset;
-  if (z < 0) return;
-  if (z < pm.vdim[0]) {
-    VECTOR(P, 4) r = in0.read(gid.xy, gid.z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
-  z -= pm.vdim[0];
-  if (z < pm.vdim[1]) {
-    VECTOR(P, 4)  r = in1.read(gid.xy, z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    int z = gid.z - pm.offset;
+    if (z < 0) return;
+    if (z < pm.vdim[0]) {
+        VECTOR(P, 4) r = in0.read(gid.xy, gid.z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
+    z -= pm.vdim[0];
+    if (z < pm.vdim[1]) {
+        VECTOR(P, 4)  r = in1.read(gid.xy, z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #if N >= 3
-  z -= pm.vdim[1];
-  if (z < pm.vdim[2]) {
-    VECTOR(P, 4)  r = in2.read(gid.xy, z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    z -= pm.vdim[1];
+    if (z < pm.vdim[2]) {
+        VECTOR(P, 4)  r = in2.read(gid.xy, z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 3
 #if N >= 4
-  z -= pm.vdim[2];
-  if (z < pm.vdim[3]) {
-    VECTOR(P, 4)  r = in3.read(gid.xy, z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    z -= pm.vdim[2];
+    if (z < pm.vdim[3]) {
+        VECTOR(P, 4)  r = in3.read(gid.xy, z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 4
 #if N >= 5
-  z -= pm.vdim[3];
-  if (z < pm.vdim[4]) {
-    VECTOR(P, 4)  r = in4.read(gid.xy, z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    z -= pm.vdim[3];
+    if (z < pm.vdim[4]) {
+        VECTOR(P, 4)  r = in4.read(gid.xy, z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 5
 #if N >= 6
-  z -= pm.vdim[4];
-  if (z < pm.vdim[5]) {
-    VECTOR(P, 4)  r = in5.read(gid.xy, z);
-    out.write(r, gid.xy, gid.z);
-    return;
-  }
+    z -= pm.vdim[4];
+    if (z < pm.vdim[5]) {
+        VECTOR(P, 4)  r = in5.read(gid.xy, z);
+        out.write(r, gid.xy, gid.z);
+        return;
+    }
 #endif // N >= 6
 }
 #endif // V == VZ
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal
index b7d17f2d25de544e4ce938c577e0d04f536da9af..8a0390e624151bac1573e6727de04df0e2bb27de 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal
@@ -18,11 +18,11 @@
 using namespace metal;
 
 struct ConcatParam {
-  int32_t odim[4];
-  int32_t axis;
-  int32_t offset;
-  int32_t trans[4];
-  int32_t vdim[6];
+    int32_t odim[4];   // presumably the output dimensions -- TODO confirm against the host-side struct
+    int32_t axis;      // presumably the concat axis -- TODO confirm
+    int32_t offset;    // NOTE(review): meaning is not visible in this file -- confirm against the kernels
+    int32_t trans[4];  // presumably a dimension permutation -- TODO confirm
+    int32_t vdim[6];   // six slots; looks like one extent per concat input (kernels index pm.vdim[k]) -- confirm
 };
 
 #define VNORMAL 1
@@ -41,129 +41,129 @@ struct ConcatParam {
 
 // ssd-ar: (R=3, N=5, V=x)
 #define V VX
-  #define R 3
-    #define N 5
-      #define P float
-        #include "ConcatKernel.inc.metal"
-      #undef P
-      #define P half
-        #include "ConcatKernel.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 3  // NOTE(review): R is presumably the tensor rank -- confirm in ConcatKernel.inc.metal
+#define N 5  // presumably the number of concat inputs -- confirm in ConcatKernel.inc.metal
+#define P float  // first pass: include the template with P = float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half  // second pass: same V/R/N, now with P = half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 // ssd-ar: (R=2, N=5, V=x)
 #define V VX
-  #define R 2
-    #define N 5
-      #define P float
-        #include "ConcatKernel.inc.metal"
-      #undef P
-      #define P half
-        #include "ConcatKernel.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 2  // NOTE(review): R is presumably the tensor rank -- confirm in ConcatKernel.inc.metal
+#define N 5  // presumably the number of concat inputs -- confirm in ConcatKernel.inc.metal
+#define P float  // first pass: include the template with P = float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half  // second pass: same V/R/N, now with P = half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 
 // ssd-ar: (R=3, N=2, V=y)
 #define V VY
-  #define R 3
-    #define N 2
-      #define P float
-        #include "ConcatKernel.inc.metal"
-      #undef P
-      #define P half
-        #include "ConcatKernel.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 3  // NOTE(review): R is presumably the tensor rank -- confirm in ConcatKernel.inc.metal
+#define N 2  // presumably the number of concat inputs -- confirm in ConcatKernel.inc.metal
+#define P float  // first pass: include the template with P = float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half  // second pass: same V/R/N, now with P = half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 // ssd-ar: (R=4, N=3, V=z)
 #define V VZ
-  #define R 4
-    #define N 3
-      #define P float
-        #include "ConcatKernel.inc.metal"
-      #undef P
-      #define P half
-        #include "ConcatKernel.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 4  // NOTE(review): R is presumably the tensor rank -- confirm in ConcatKernel.inc.metal
+#define N 3  // presumably the number of concat inputs -- confirm in ConcatKernel.inc.metal
+#define P float  // first pass: include the template with P = float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half  // second pass: same V/R/N, now with P = half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 
 // ssd: (R=2, N=6, V=y)
 #define V VY
-  #define R 2
-    #define N 6
-      #define P float
-        #include "ConcatKernel.inc.metal"
-      #undef P
-      #define P half
-        #include "ConcatKernel.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 2  // NOTE(review): R is presumably the tensor rank -- confirm in ConcatKernel.inc.metal
+#define N 6  // presumably the number of concat inputs; 6 matches the size of ConcatParam::vdim
+#define P float  // first pass: include the template with P = float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half  // second pass: same V/R/N, now with P = half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 // ssd: (R=3, N=6, V=y)
 #define V VY
-  #define R 3
-    #define N 6
-      #define P float
-        #include "ConcatKernel.inc.metal"
-      #undef P
-      #define P half
-        #include "ConcatKernel.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 3  // NOTE(review): R is presumably the tensor rank -- confirm in ConcatKernel.inc.metal
+#define N 6  // presumably the number of concat inputs; 6 matches the size of ConcatParam::vdim
+#define P float  // first pass: include the template with P = float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half  // second pass: same V/R/N, now with P = half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 #define V VNORMAL
-  #define R 4
-    #define N 2
-      #define P float
-        #include "ConcatKernel.inc.metal"
-      #undef P
-      #define P half
-        #include "ConcatKernel.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 4  // this group has no header comment in the file: it instantiates (R=4, N=2, V=VNORMAL)
+#define N 2  // presumably the number of concat inputs -- confirm in ConcatKernel.inc.metal
+#define P float  // first pass: include the template with P = float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half  // second pass: same V/R/N, now with P = half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 
 #define V VY
-  #define R 2
-    #define N 2
-      #define P float
-        #include "ConcatKernel.inc.metal"
-      #undef P
-      #define P half
-        #include "ConcatKernel.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 2  // this group has no header comment in the file: it instantiates (R=2, N=2, V=VY)
+#define N 2  // presumably the number of concat inputs -- confirm in ConcatKernel.inc.metal
+#define P float  // first pass: include the template with P = float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half  // second pass: same V/R/N, now with P = half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 
 #define V VY
-  #define R 2
-    #define N 5
-      #define P float
-        #include "ConcatKernel.inc.metal"
-      #undef P
-      #define P half
-        #include "ConcatKernel.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 2  // this group has no header comment in the file: it instantiates (R=2, N=5, V=VY)
+#define N 5  // presumably the number of concat inputs -- confirm in ConcatKernel.inc.metal
+#define P float  // first pass: include the template with P = float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half  // second pass: same V/R/N, now with P = half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddBNReluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddBNReluKernel.metal
index 87b60a64fc48ab89af274e0b24897e0b411599e0..f55386096f582b560abc4ea7c97945188afd1c9b 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddBNReluKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddBNReluKernel.metal
@@ -18,147 +18,147 @@ using namespace metal;
 
 
 kernel void conv_add_batch_norm_relu_1x1_half(
-            texture2d_array<half, access::sample> inTexture [[texture(0)]],
-            texture2d_array<half, access::write> outTexture [[texture(1)]],
-            constant MetalConvParam &param [[buffer(0)]],
-            const device half4 *weights [[buffer(1)]],
-            const device half4 *biase [[buffer(2)]],
-            const device half4 *new_scale [[buffer(3)]],
-            const device half4 *new_biase [[buffer(4)]],
-            uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  half4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
+                                              texture2d_array<half, access::sample> inTexture [[texture(0)]],  // input slices, read through a sampler
+                                              texture2d_array<half, access::write> outTexture [[texture(1)]],  // output slices, write-only
+                                              constant MetalConvParam &param [[buffer(0)]],  // supplies strideX/Y and offsetX/Y
+                                              const device half4 *weights [[buffer(1)]],  // indexed by output slice, output component and input slice
+                                              const device half4 *biase [[buffer(2)]],  // one half4 per output slice
+                                              const device half4 *new_scale [[buffer(3)]],  // one half4 per output slice
+                                              const device half4 *new_biase [[buffer(4)]],  // one half4 per output slice
+                                              uint3 gid [[thread_position_in_grid]]) {  // (x, y, output slice)
     
-    half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
+    if (gid.x >= outTexture.get_width() ||  // threads outside the output texture do nothing
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);  // input pixel = gid * stride + offset
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);  // pixel coordinates, nearest texel, clamp_to_zero addressing
+    const uint kernelHXW = 1;  // a 1x1 kernel has a single tap
+    
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;  // start of this output slice's weights: 4 rows of kernelHXW * input_arr_size
     
-    half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
+    float4 output = float4(0.0);  // accumulate in float; narrowed to half only on write
     
-    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-  output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
-  outTexture.write(half4(output), gid.xy, gid.z);
+    half4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {  // one pass per input array slice
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];  // row 0 feeds output.x
+        output.x += dot(input, weight_x);
+        
+        half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];  // row 1 feeds output.y
+        output.y += dot(input, weight_y);
+        
+        half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];  // row 2 feeds output.z
+        output.z += dot(input, weight_z);
+        
+        half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];  // row 3 feeds output.w
+        output.w += dot(input, weight_w);
+    }
+    output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);  // add bias, scale and shift, then clamp negatives to 0
+    outTexture.write(half4(output), gid.xy, gid.z);  // convert to half and store at (x, y) in slice gid.z
 }
 
 kernel void conv_add_batch_norm_relu_3x3_half(
-            texture2d_array<half, access::sample> inTexture [[texture(0)]],
-            texture2d_array<half, access::write> outTexture [[texture(1)]],
-            constant MetalConvParam &param [[buffer(0)]],
-            const device half4 *weights [[buffer(1)]],
-            const device half4 *biase [[buffer(2)]],
-            const device half4 *new_scale [[buffer(3)]],
-            const device half4 *new_biase [[buffer(4)]],
-            uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  half4 input[9];
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
-    for (int j = 0; j < 9; ++j) {
-      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+                                              texture2d_array<half, access::sample> inTexture [[texture(0)]],  // input slices, read through a sampler
+                                              texture2d_array<half, access::write> outTexture [[texture(1)]],  // output slices, write-only
+                                              constant MetalConvParam &param [[buffer(0)]],  // supplies strideX/Y and offsetX/Y
+                                              const device half4 *weights [[buffer(1)]],  // indexed by output slice, output component, tap and input slice
+                                              const device half4 *biase [[buffer(2)]],  // one half4 per output slice
+                                              const device half4 *new_scale [[buffer(3)]],  // one half4 per output slice
+                                              const device half4 *new_biase [[buffer(4)]],  // one half4 per output slice
+                                              uint3 gid [[thread_position_in_grid]]) {  // (x, y, output slice)
+    
+    if (gid.x >= outTexture.get_width() ||  // threads outside the output texture do nothing
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);  // centre of the 3x3 window: gid * stride + offset
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);  // pixel coordinates, nearest texel, clamp_to_zero addressing
+    const uint kernelHXW = 9;  // 3x3 kernel: nine taps
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;  // start of this output slice's weights: 4 rows of kernelHXW * input_arr_size
+    
+    float4 output = float4(0.0);  // accumulate in float; narrowed to half only on write
+    
+    half4 input[9];  // the 3x3 neighbourhood, row by row (top-left first)
+    for (uint i = 0; i < input_arr_size; ++i) {  // one pass per input array slice
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);  // taps past the texture edge rely on the sampler's clamp_to_zero mode
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);  // centre tap
+        input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+        for (int j = 0; j < 9; ++j) {  // j walks the nine taps; within a row, tap j's weights start at j * input_arr_size
+            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];  // row 0 feeds output.x
+            output.x += dot(input[j], weight_x);
+            
+            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];  // row 1 feeds output.y
+            output.y += dot(input[j], weight_y);
+            
+            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];  // row 2 feeds output.z
+            output.z += dot(input[j], weight_z);
+            
+            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];  // row 3 feeds output.w
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-  output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
-  outTexture.write(half4(output), gid.xy, gid.z);
+    output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);  // add bias, scale and shift, then clamp negatives to 0
+    outTexture.write(half4(output), gid.xy, gid.z);  // convert to half and store at (x, y) in slice gid.z
 }
 
 kernel void depthwise_conv_add_batch_norm_relu_3x3_half(
-            texture2d_array<half, access::sample> inTexture [[texture(0)]],
-            texture2d_array<half, access::write> outTexture [[texture(1)]],
-            constant MetalConvParam &param [[buffer(0)]],
-            const device half *weights [[buffer(1)]],
-            const device half4 *biase [[buffer(2)]],
-            const device half4 *new_scale [[buffer(3)]],
-            const device half4 *new_biase [[buffer(4)]],
-            uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = float4(0.0);
-  half4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    half4 input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-  output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
-  outTexture.write(half4(output), gid.xy, gid.z);
+                                                        texture2d_array<half, access::sample> inTexture [[texture(0)]],  // input slices, read through a sampler
+                                                        texture2d_array<half, access::write> outTexture [[texture(1)]],  // output slices, write-only
+                                                        constant MetalConvParam &param [[buffer(0)]],  // supplies strideX/Y and offsetX/Y
+                                                        const device half *weights [[buffer(1)]],  // scalar weights (half, not half4): 9 taps per component
+                                                        const device half4 *biase [[buffer(2)]],  // one half4 per output slice
+                                                        const device half4 *new_scale [[buffer(3)]],  // one half4 per output slice
+                                                        const device half4 *new_biase [[buffer(4)]],  // one half4 per output slice
+                                                        uint3 gid [[thread_position_in_grid]]) {  // (x, y, output slice)
+    
+    if (gid.x >= outTexture.get_width() ||  // threads outside the output texture do nothing
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z;  // the input is read from the same slice index that is written
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);  // centre of the 3x3 window: gid * stride + offset
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);  // pixel coordinates, nearest texel, clamp_to_zero addressing
+    const uint kernelHXW = 9;  // 3x3 kernel: nine taps
+    uint weithTo = gid.z * kernelHXW * 4;  // start of this slice's weights: 4 components * 9 taps
+    float4 output = float4(0.0);  // accumulate in float; narrowed to half only on write
+    half4 inputs[9];  // the 3x3 neighbourhood, row by row (top-left first)
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);  // taps past the texture edge rely on the sampler's clamp_to_zero mode
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);  // centre tap
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {  // each component is filtered with its own nine weights; components are not mixed
+        half4 input = inputs[j];
+        output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+    }
+    output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);  // add bias, scale and shift, then clamp negatives to 0
+    outTexture.write(half4(output), gid.xy, gid.z);  // convert to half and store at (x, y) in slice gid.z
 }
 
 
@@ -175,41 +175,41 @@ kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample>
                                          const device float4 *new_scale [[buffer(3)]],
                                          const device float4 *new_biase [[buffer(4)]],
                                          uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  float4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
     
-    float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
     
-    float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
+    float4 output = float4(0.0);
     
-    float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
-  outTexture.write(output, gid.xy, gid.z);
+    float4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+        output.x += dot(input, weight_x);
+        
+        float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+        output.y += dot(input, weight_y);
+        
+        float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+        output.z += dot(input, weight_z);
+        
+        float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(input, weight_w);
+    }
+    output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -220,50 +220,50 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample>
                                          const device float4 *new_scale [[buffer(3)]],
                                          const device float4 *new_biase [[buffer(4)]],
                                          uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  float4 input[9];
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
-    for (int j = 0; j < 9; ++j) {
-      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = float4(0.0);
+    
+    float4 input[9];
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+        for (int j = 0; j < 9; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
-  outTexture.write(output, gid.xy, gid.z);
+    output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -274,37 +274,37 @@ kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access
                                                    const device float4 *new_scale [[buffer(3)]],
                                                    const device float4 *new_biase [[buffer(4)]],
                                                    uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = float4(0.0);
-  float4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    float4 input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
-  outTexture.write(output, gid.xy, gid.z);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z;
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint weithTo = gid.z * kernelHXW * 4;
+    float4 output = float4(0.0);
+    float4 inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {
+        float4 input = inputs[j];
+        output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+    }
+    output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal
index 274e416576743a473ba8931bcd538e9c39415f3c..e2513e1b1e86adf2f96dadb47e852d6ef38beb38 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal
@@ -24,41 +24,41 @@ kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[text
                          const device float4 *weights [[buffer(1)]],
                          const device float4 *biase [[buffer(2)]],
                          uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = biase[gid.z];
-  
-  float4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
-    
-    float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
-    
-    float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
-    
-    float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-//  output = output + biase[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = biase[gid.z];
+    
+    float4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+        output.x += dot(input, weight_x);
+        
+        float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+        output.y += dot(input, weight_y);
+        
+        float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+        output.z += dot(input, weight_z);
+        
+        float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(input, weight_w);
+    }
+    //  output = output + biase[gid.z];
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -67,66 +67,66 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text
                          const device float4 *weights [[buffer(1)]],
                          const device float4 *biase [[buffer(2)]],
                          uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  
-  const uint kernelHXW = 9;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = biase[gid.z];
-  
-  ushort dilation_x = param.dilationX;
-  ushort dilation_y = param.dilationY;
-  
-  float4 input[9];
-  
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
-    
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
-    
-    input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
-    
-    input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
-    
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    
-    input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
-    
-    input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
-    
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
-    
-    input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
     
-    for (int j = 0; j < 9; ++j) {
-      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    
+    const uint kernelHXW = 9;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = biase[gid.z];
+    
+    ushort dilation_x = param.dilationX;
+    ushort dilation_y = param.dilationY;
+    
+    float4 input[9];
+    
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
+        
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
+        
+        input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
+        
+        input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
+        
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        
+        input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
+        
+        input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
+        
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
+        
+        input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
+        
+        for (int j = 0; j < 9; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-//  output = output + biase[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
+    //  output = output + biase[gid.z];
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void conv_add_5x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -135,56 +135,56 @@ kernel void conv_add_5x1(texture2d_array<float, access::sample> inTexture [[text
                          const device float4 *weights [[buffer(1)]],
                          const device float4 *biase [[buffer(2)]],
                          uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  
-  const uint kernelHXW = 5;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = biase[gid.z];
-  
-  ushort dilation_y = param.dilationY;
-  float4 input[5];
-  
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
-    
-    input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
-    
-    input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    
-    input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
-    
-    input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
-    
-    for (int j = 0; j < 5; ++j) {
-      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    
+    const uint kernelHXW = 5;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = biase[gid.z];
+    
+    ushort dilation_y = param.dilationY;
+    float4 input[5];
+    
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
+        
+        input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+        
+        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        
+        input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+        
+        input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
+        
+        for (int j = 0; j < 5; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-//  output = output + biase[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
+    //  output = output + biase[gid.z];
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 
@@ -194,56 +194,56 @@ kernel void conv_add_1x5(texture2d_array<float, access::sample> inTexture [[text
                          const device float4 *weights [[buffer(1)]],
                          const device float4 *biase [[buffer(2)]],
                          uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  
-  const uint kernelHXW = 5;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = biase[gid.z];
-  
-  ushort dilation_x = param.dilationX;
-  float4 input[5];
-  
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
-    
-    input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
-    
-    input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    
-    input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
-    
-    input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
-    
-    for (int j = 0; j < 5; ++j) {
-      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
     }
-  }
-//  output = output + biase[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    
+    const uint kernelHXW = 5;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = biase[gid.z];
+    
+    ushort dilation_x = param.dilationX;
+    float4 input[5];
+    
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
+        
+        input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+        
+        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        
+        input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+        
+        input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
+        
+        for (int j = 0; j < 5; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
+    }
+    //  output = output + biase[gid.z];
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 
@@ -253,297 +253,297 @@ kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inText
                                    const device float *weights [[buffer(1)]],
                                    const device float4 *biase [[buffer(2)]],
                                    uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = biase[gid.z];
-  float4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    float4 input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-//  output = output + biase[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z;
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint weithTo = gid.z * kernelHXW * 4;
+    float4 output = biase[gid.z];
+    float4 inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {
+        float4 input = inputs[j];
+        output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+    }
+    //  output = output + biase[gid.z];
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 
 #pragma mark - half
 
 kernel void conv_add_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                         texture2d_array<half, access::write> outTexture [[texture(1)]],
-                         constant MetalConvParam &param [[buffer(0)]],
-                         const device half4 *weights [[buffer(1)]],
-                         const device half4 *biase [[buffer(2)]],
-                         uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  half4 output = biase[gid.z];
-  
-  half4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
-    
-    half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
-    
-    half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
-    
-    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-//  output = output + float4(biase[gid.z]);
-  outTexture.write(output, gid.xy, gid.z);
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+    // Fused 1x1 convolution + bias over half-precision textures. Each thread computes the
+    // half4 texel at gid.xy of output slice gid.z, accumulating in float and narrowing to
+    // half only for the final write.
+    const bool insideOutput = gid.x < outTexture.get_width() &&
+                              gid.y < outTexture.get_height() &&
+                              gid.z < outTexture.get_array_size();
+    if (!insideOutput) {
+        return;
+    }
+    
+    // Input position sampled by this thread: output position * stride + offset.
+    const ushort2 step = ushort2(param.strideX, param.strideY);
+    const ushort2 origin = ushort2(param.offsetX, param.offsetY);
+    const ushort2 srcPos = ushort2(gid.xy) * step + origin;
+    const float2 srcCoord = float2(srcPos.x, srcPos.y);
+    
+    // Pixel coordinates, nearest filtering; reads outside the texture return zero.
+    constexpr sampler nearestSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+    
+    // Weight indexing: each output slice owns 4 * sliceCount half4 entries, laid out as
+    // four runs of sliceCount entries, one run per output component (x, y, z, w).
+    const uint sliceCount = inTexture.get_array_size();
+    const uint sliceBase = gid.z * sliceCount * 4;
+    
+    // Start from the bias so no separate add is needed after the reduction.
+    float4 acc = float4(biase[gid.z]);
+    for (uint slice = 0; slice < sliceCount; ++slice) {
+        const float4 texel = float4(inTexture.sample(nearestSampler, srcCoord, slice));
+        for (uint c = 0; c < 4; ++c) {
+            const float4 w = float4(weights[sliceBase + c * sliceCount + slice]);
+            acc[c] += dot(texel, w);
+        }
+    }
+    
+    outTexture.write(half4(acc), gid.xy, gid.z);
 }
 
 kernel void conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                         texture2d_array<half, access::write> outTexture [[texture(1)]],
-                         constant MetalConvParam &param [[buffer(0)]],
-                         const device half4 *weights [[buffer(1)]],
-                         const device half4 *biase [[buffer(2)]],
-                         uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  half4 output = biase[gid.z];
-  
-  ushort dilation_x = param.dilationX;
-  ushort dilation_y = param.dilationY;
-  
-  half4 input[9];
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y - dilation_y), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y - dilation_y), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
-    for (int j = 0; j < 9; ++j) {
-      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(float4(input[j]), float4(weight_x));
-      
-      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(float4(input[j]), float4(weight_y));
-      
-      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(float4(input[j]), float4(weight_z));
-      
-      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(float4(input[j]), float4(weight_w));
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+    // 3x3 convolution (with dilation) + bias on half textures: writes texel gid.xy of output slice gid.z.
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
     }
-  }
-//  output = output + float4(biase[gid.z]);
-  outTexture.write(output, gid.xy, gid.z);
+    // Centre of the 3x3 window in the input: output position * stride + offset.
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    // Pixel coordinates, nearest filtering; taps outside the texture read as zero.
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint input_arr_size = inTexture.get_array_size();
+    // Offset of this output slice's weights: 4 components x 9 taps x input_arr_size each.
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    // Accumulate in float (as conv_add_1x1_half does) rather than half: a half accumulator
+    // rounds the running sum after every one of the 9 * input_arr_size partial products.
+    float4 output = float4(biase[gid.z]);
+    
+    ushort dilation_x = param.dilationX;
+    ushort dilation_y = param.dilationY;
+    half4 input[9];
+    for (uint i = 0; i < input_arr_size; ++i) {
+        // Gather the 3x3 neighbourhood of input slice i, row by row, spaced by the dilation.
+        input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y - dilation_y), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y - dilation_y), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
+        for (int j = 0; j < 9; ++j) {
+            // Tap j of input slice i, one weight vector per output component.
+            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(float4(input[j]), float4(weight_x));
+            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(float4(input[j]), float4(weight_y));
+            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(float4(input[j]), float4(weight_z));
+            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(float4(input[j]), float4(weight_w));
+        }
+    }
+    // Bias was folded into the initial value; narrow to half only for the write.
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
 
 kernel void depthwise_conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                   texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                   constant MetalConvParam &param [[buffer(0)]],
-                                   const device half *weights [[buffer(1)]],
-                                   const device half4 *biase [[buffer(2)]],
-                                   uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  half4 output = biase[gid.z];
-  half4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    half4 input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-//  output = output + float4(biase[gid.z]);
-  outTexture.write(output, gid.xy, gid.z);
+                                        texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                        constant MetalConvParam &param [[buffer(0)]],
+                                        const device half *weights [[buffer(1)]],
+                                        const device half4 *biase [[buffer(2)]],
+                                        uint3 gid [[thread_position_in_grid]]) {
+    // Depthwise 3x3 convolution plus bias: each output slice is computed from the same-index input slice only.
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return; // this thread lies outside the output texture
+    }
+    uint output_slice = gid.z; // also used as the input slice for every sample below
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); // centre tap in the input
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); // pixel coordinates, nearest texel
+    const uint kernelHXW = 9; // taps in the 3x3 window
+    uint weithTo = gid.z * kernelHXW * 4; // first scalar weight of this slice: 4 components x 9 taps
+    half4 output = biase[gid.z]; // accumulate on top of the bias
+    half4 inputs[9]; // 3x3 neighbourhood, row by row; offsets are fixed at +/-1 (no dilation is read here)
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) { // each component has its own run of 9 scalar weights
+        half4 input = inputs[j];
+        output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+    }
+    // No separate bias add is needed here: output was initialised from biase[gid.z] above.
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 
 kernel void conv_add_5x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                         texture2d_array<half, access::write> outTexture [[texture(1)]],
-                         constant MetalConvParam &param [[buffer(0)]],
-                         const device half4 *weights [[buffer(1)]],
-                         const device half4 *biase [[buffer(2)]],
-                         uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  
-  const uint kernelHXW = 5;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  half4 output = biase[gid.z];
-  
-  ushort dilation_y = param.dilationY;
-  half4 input[5];
-  
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
-    
-    input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
-    
-    input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    
-    input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
-    
-    input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
-    
-    for (int j = 0; j < 5; ++j) {
-      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+    // 5-tap convolution along y (x stays fixed), summed over every input slice and seeded with the bias.
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return; // this thread lies outside the output texture
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); // centre tap in the input
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); // pixel coordinates, nearest texel
+    
+    const uint kernelHXW = 5; // taps per window
+    
+    uint input_arr_size = inTexture.get_array_size();
+    
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4; // first half4 weight belonging to this output slice
+    
+    half4 output = biase[gid.z]; // accumulate on top of the bias
+    
+    ushort dilation_y = param.dilationY; // vertical spacing between neighbouring taps
+    half4 input[5];
+    
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
+        
+        input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+        
+        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        
+        input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+        
+        input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
+        // Weights are indexed as [output component][tap j][input slice i] relative to weithTo.
+        for (int j = 0; j < 5; ++j) {
+            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-//  output = output + float4(biase[gid.z]);
-  outTexture.write(output, gid.xy, gid.z);
+    // No separate bias add is needed here: output was initialised from biase[gid.z] above.
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 
 kernel void conv_add_1x5_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                         texture2d_array<half, access::write> outTexture [[texture(1)]],
-                         constant MetalConvParam &param [[buffer(0)]],
-                         const device half4 *weights [[buffer(1)]],
-                         const device half4 *biase [[buffer(2)]],
-                         uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  
-  const uint kernelHXW = 5;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  half4 output = biase[gid.z];
-  
-  ushort dilation_x = param.dilationX;
-  half4 input[5];
-  
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
-    
-    input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
-    
-    input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    
-    input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
-    
-    input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
-    
-    for (int j = 0; j < 5; ++j) {
-      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+    // 5-tap convolution along x (y stays fixed), summed over every input slice and seeded with the bias.
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return; // this thread lies outside the output texture
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); // centre tap in the input
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); // pixel coordinates, nearest texel
+    
+    const uint kernelHXW = 5; // taps per window
+    
+    uint input_arr_size = inTexture.get_array_size();
+    
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4; // first half4 weight belonging to this output slice
+    
+    half4 output = biase[gid.z]; // accumulate on top of the bias
+    
+    ushort dilation_x = param.dilationX; // horizontal spacing between neighbouring taps
+    half4 input[5];
+    
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
+        
+        input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+        
+        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        
+        input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+        
+        input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
+        // Weights are indexed as [output component][tap j][input slice i] relative to weithTo.
+        for (int j = 0; j < 5; ++j) {
+            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-//  output = output + float4(biase[gid.z]);
-  outTexture.write(output, gid.xy, gid.z);
+    // No separate bias add is needed here: output was initialised from biase[gid.z] above.
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 
@@ -553,69 +553,69 @@ kernel void test_conv_add_3x3(texture2d_array<float, access::sample> inTexture [
                               const device float4 *weights [[buffer(1)]],
                               const device float4 *biase [[buffer(2)]],
                               uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  if (gid.x > 0 || gid.y > 0 || gid.z > 0) { return; }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  
-  const uint kernelHXW = 9;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  ushort dilation_x = param.dilationX;
-  ushort dilation_y = param.dilationY;
-  
-  float4 input[9];
-  
-  for (uint i = 0; i < input_arr_size; ++i) {
-    
-    input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
-    
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
-    
-    input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
-    
-    input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
-    
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    
-    input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
-    
-    input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
-    
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
-    
-    input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
     
-    for (int j = 0; j < 9; ++j) {
-      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    if (gid.x > 0 || gid.y > 0 || gid.z > 0) { return; }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    
+    const uint kernelHXW = 9;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = float4(0.0);
+    
+    ushort dilation_x = param.dilationX;
+    ushort dilation_y = param.dilationY;
+    
+    float4 input[9];
+    
+    for (uint i = 0; i < input_arr_size; ++i) {
+        
+        input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
+        
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
+        
+        input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
+        
+        input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
+        
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        
+        input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
+        
+        input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
+        
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
+        
+        input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
+        
+        for (int j = 0; j < 9; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-  //  output = output + biase[gid.z];
-  outTexture.write(output, gid.xy, gid.z);
+    //  output = output + biase[gid.z];
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPrelu.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPrelu.inc.metal
index 069daa20e875eb00c0d518e0463987248ca8dce5..e2b8834cc5314897f04f485a012b88fc29e5054d 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPrelu.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPrelu.inc.metal
@@ -19,428 +19,428 @@
 
 #pragma mark - convAdd
 kernel void FUNC3_(conv_add_1x1, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
-                         texture2d_array<P, access::write> outTexture [[texture(1)]],
-                         constant MetalConvParam &param [[buffer(0)]],
-                         const device VECTOR(P, 4) *weights [[buffer(1)]],
-                         const device VECTOR(P, 4) *biase [[buffer(2)]],
+                                                texture2d_array<P, access::write> outTexture [[texture(1)]],
+                                                constant MetalConvParam &param [[buffer(0)]],
+                                                const device VECTOR(P, 4) *weights [[buffer(1)]],
+                                                const device VECTOR(P, 4) *biase [[buffer(2)]],
 #ifdef PRELU_CHANNEL
-                         const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_ELEMENT
-                         const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_OTHER
-                         const device P *alpha [[buffer(3)]],
+                                                const device P *alpha [[buffer(3)]],
 #endif
-                         uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  VECTOR(P, 4) output = biase[gid.z];
-  
-  VECTOR(P, 4) input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample,float2(posInInput.x, posInInput.y), i);
-    VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
-    
-    VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
-    
-    VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
-    
-    VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-  
-//  output = output + float4(biase[gid.z]);
-  
+                                                uint3 gid [[thread_position_in_grid]]) {
+    // 1x1 convolution plus bias over all input slices, followed by the PReLU variant selected at compile time.
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return; // this thread lies outside the output texture
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); // the single tap position
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); // pixel coordinates, nearest texel
+    const uint kernelHXW = 1; // a 1x1 window has a single tap
+    
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4; // first weight vector belonging to this output slice
+    
+    VECTOR(P, 4) output = biase[gid.z]; // accumulate on top of the bias
+    
+    VECTOR(P, 4) input;
+    for (uint i = 0; i < input_arr_size; ++i) { // one weight vector per (output component, input slice)
+        input = inTexture.sample(sample,float2(posInInput.x, posInInput.y), i);
+        VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+        output.x += dot(input, weight_x);
+        
+        VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+        output.y += dot(input, weight_y);
+        
+        VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+        output.z += dot(input, weight_z);
+        
+        VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(input, weight_w);
+    }
+    
+    // No separate bias add is needed here: output was initialised from biase[gid.z] above.
+    
 #ifdef PRELU_CHANNEL
-  VECTOR(P, 4) alpha_value = alpha[gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    VECTOR(P, 4) alpha_value = alpha[gid.z]; // one slope vector per output slice
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); // values that are not positive are scaled by alpha
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_ELEMENT
-  int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
-  VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); // start of this pixel's per-slice slopes
+    VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); // values that are not positive are scaled by alpha
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_OTHER
-  P alpha_value = alpha[0];
-  output.x = output.x > 0 ? output.x : (alpha_value * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value * output.w);
+    P alpha_value = alpha[0]; // a single slope shared by every component
+    output.x = output.x > 0 ? output.x : (alpha_value * output.x); // values that are not positive are scaled by alpha
+    output.y = output.y > 0 ? output.y : (alpha_value * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value * output.w);
 #endif
-  outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
+    outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
 }
 
 kernel void FUNC3_(conv_add_3x3, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
-    texture2d_array<P, access::write> outTexture [[texture(1)]],
-    constant MetalConvParam &param [[buffer(0)]],
-    const device VECTOR(P, 4) *weights [[buffer(1)]],
-    const device VECTOR(P, 4) *biase [[buffer(2)]],
+                                                texture2d_array<P, access::write> outTexture [[texture(1)]],
+                                                constant MetalConvParam &param [[buffer(0)]],
+                                                const device VECTOR(P, 4) *weights [[buffer(1)]],
+                                                const device VECTOR(P, 4) *biase [[buffer(2)]],
 #ifdef PRELU_CHANNEL
-     const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_ELEMENT
-     const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_OTHER
-     const device P *alpha [[buffer(3)]],
+                                                const device P *alpha [[buffer(3)]],
 #endif
-     uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-
-  const uint kernelHXW = 9;
-
-  uint input_arr_size = inTexture.get_array_size();
-
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-
-  VECTOR(P, 4) output = biase[gid.z];
-
-  ushort dilation_x = param.dilationX;
-  ushort dilation_y = param.dilationY;
-
-  VECTOR(P, 4) input[9];
-
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
-
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
-
-    input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
-
-    input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
-
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-
-    input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
-
-    input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
-
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
-
-    input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
-
-    for (int j = 0; j < 9; ++j) {
-      VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-
-      VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-
-      VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-
-      VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+                                                uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    
+    const uint kernelHXW = 9;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    VECTOR(P, 4) output = biase[gid.z];
+    
+    ushort dilation_x = param.dilationX;
+    ushort dilation_y = param.dilationY;
+    
+    VECTOR(P, 4) input[9];
+    
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
+        
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
+        
+        input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
+        
+        input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
+        
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        
+        input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
+        
+        input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
+        
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
+        
+        input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
+        
+        for (int j = 0; j < 9; ++j) {
+            VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-//  output = output + float4(biase[gid.z]);
-  
+    //  output = output + float4(biase[gid.z]);
+    
 #ifdef PRELU_CHANNEL
-  VECTOR(P, 4) alpha_value = alpha[gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    VECTOR(P, 4) alpha_value = alpha[gid.z];
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_ELEMENT
-  int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
-  VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
+    VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_OTHER
-  P alpha_value = alpha[0];
-  output.x = output.x > 0 ? output.x : (alpha_value * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value * output.w);
+    P alpha_value = alpha[0];
+    output.x = output.x > 0 ? output.x : (alpha_value * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value * output.w);
 #endif
-  outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
+    outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
 }
 
 kernel void FUNC3_(conv_add_5x1, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
-                         texture2d_array<P, access::write> outTexture [[texture(1)]],
-                         constant MetalConvParam &param [[buffer(0)]],
-                         const device VECTOR(P, 4) *weights [[buffer(1)]],
-                         const device VECTOR(P, 4) *biase [[buffer(2)]],
+                                                texture2d_array<P, access::write> outTexture [[texture(1)]],
+                                                constant MetalConvParam &param [[buffer(0)]],
+                                                const device VECTOR(P, 4) *weights [[buffer(1)]],
+                                                const device VECTOR(P, 4) *biase [[buffer(2)]],
 #ifdef PRELU_CHANNEL
-                        const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_ELEMENT
-                        const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_OTHER
-                        const device P *alpha [[buffer(3)]],
+                                                const device P *alpha [[buffer(3)]],
 #endif
-                         uint3 gid [[thread_position_in_grid]]) {
-
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-
-  const uint kernelHXW = 5;
-
-  uint input_arr_size = inTexture.get_array_size();
-
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-
-  VECTOR(P, 4) output = biase[gid.z];;
-
-  ushort dilation_y = param.dilationY;
-  VECTOR(P, 4) input[5];
-
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
-
-    input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
-
-    input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-
-    input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
-
-    input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
-
-    for (int j = 0; j < 5; ++j) {
-      VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-
-      VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-
-      VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-
-      VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+                                                uint3 gid [[thread_position_in_grid]]) {
+    // Convolution with five taps along y, plus bias and PReLU. One thread writes one texel of slice gid.z; out-of-range threads exit.
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    // Map the output coordinate to the input texel used as the centre tap: gid.xy * stride + offset.
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    // Sampler: unnormalized pixel coordinates, nearest filtering, clamp_to_zero addressing for out-of-range taps.
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    // Number of taps in the kernel window: five positions along y, one along x.
+    const uint kernelHXW = 5;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    // Index of the first weight vector for this output slice: 4 output channels x kernelHXW taps x input_arr_size slices.
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    // The accumulator starts from the bias of this output slice.
+    VECTOR(P, 4) output = biase[gid.z];;
+    
+    ushort dilation_y = param.dilationY;
+    VECTOR(P, 4) input[5];
+    // Accumulate the contribution of every input slice; taps sit at -2, -1, 0, +1, +2 times dilation_y around posInInput.y.
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
+        
+        input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+        
+        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        
+        input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+        
+        input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
+        // For each tap j, dot the sampled texel with the weight vector of each of the four output channels.
+        for (int j = 0; j < 5; ++j) {
+            VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-  
+    // PReLU: components that are not positive are multiplied by alpha; the PRELU_* macro selects how alpha is looked up.
 #ifdef PRELU_CHANNEL
-  VECTOR(P, 4) alpha_value = alpha[gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    VECTOR(P, 4) alpha_value = alpha[gid.z]; // one alpha vector per output slice
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_ELEMENT
-  int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
-  VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); // alpha index of slice 0 at pixel (gid.x, gid.y)
+    VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_OTHER
-  P alpha_value = alpha[0];
-  output.x = output.x > 0 ? output.x : (alpha_value * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value * output.w);
+    P alpha_value = alpha[0]; // a single scalar alpha applied to all four components
+    output.x = output.x > 0 ? output.x : (alpha_value * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value * output.w);
 #endif
-  outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
+    outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
 }
 
 
 kernel void FUNC3_(conv_add_1x5, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
-                         texture2d_array<P, access::write> outTexture [[texture(1)]],
-                         constant MetalConvParam &param [[buffer(0)]],
-                         const device VECTOR(P, 4) *weights [[buffer(1)]],
-                         const device VECTOR(P, 4) *biase [[buffer(2)]],
+                                                texture2d_array<P, access::write> outTexture [[texture(1)]],
+                                                constant MetalConvParam &param [[buffer(0)]],
+                                                const device VECTOR(P, 4) *weights [[buffer(1)]],
+                                                const device VECTOR(P, 4) *biase [[buffer(2)]],
 #ifdef PRELU_CHANNEL
-                         const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_ELEMENT
-                         const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_OTHER
-                         const device P *alpha [[buffer(3)]],
+                                                const device P *alpha [[buffer(3)]],
 #endif
-                         uint3 gid [[thread_position_in_grid]]) {
-
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-
-  const uint kernelHXW = 5;
-
-  uint input_arr_size = inTexture.get_array_size();
-
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-
-  VECTOR(P, 4) output = biase[gid.z];
-
-  ushort dilation_x = param.dilationX;
-  VECTOR(P, 4) input[5];
-
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
-
-    input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
-
-    input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-
-    input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
-
-    input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
-
-    for (int j = 0; j < 5; ++j) {
-      VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-
-      VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-
-      VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-
-      VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+                                                uint3 gid [[thread_position_in_grid]]) {
+    // Convolution with five taps along x, plus bias and PReLU. One thread writes one texel of slice gid.z; out-of-range threads exit.
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    // Map the output coordinate to the input texel used as the centre tap: gid.xy * stride + offset.
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    // Sampler: unnormalized pixel coordinates, nearest filtering, clamp_to_zero addressing for out-of-range taps.
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    // Number of taps in the kernel window: five positions along x, one along y.
+    const uint kernelHXW = 5;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    // Index of the first weight vector for this output slice: 4 output channels x kernelHXW taps x input_arr_size slices.
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    // The accumulator starts from the bias of this output slice.
+    VECTOR(P, 4) output = biase[gid.z];
+    
+    ushort dilation_x = param.dilationX;
+    VECTOR(P, 4) input[5];
+    // Accumulate the contribution of every input slice; taps sit at -2, -1, 0, +1, +2 times dilation_x around posInInput.x.
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
+        
+        input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+        
+        input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        
+        input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+        
+        input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
+        // For each tap j, dot the sampled texel with the weight vector of each of the four output channels.
+        for (int j = 0; j < 5; ++j) {
+            VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-  
+    // PReLU: components that are not positive are multiplied by alpha; the PRELU_* macro selects how alpha is looked up.
 #ifdef PRELU_CHANNEL
-  VECTOR(P, 4) alpha_value = alpha[gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    VECTOR(P, 4) alpha_value = alpha[gid.z]; // one alpha vector per output slice
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_ELEMENT
-  int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
-  VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); // alpha index of slice 0 at pixel (gid.x, gid.y)
+    VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_OTHER
-  P alpha_value = alpha[0];
-  output.x = output.x > 0 ? output.x : (alpha_value * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value * output.w);
+    P alpha_value = alpha[0]; // a single scalar alpha applied to all four components
+    output.x = output.x > 0 ? output.x : (alpha_value * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value * output.w);
 #endif
-  outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
+    outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
 }
 
 kernel void FUNC3_(depthwise_conv_add_3x3, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
-    texture2d_array<P, access::write> outTexture [[texture(1)]],
-    constant MetalConvParam &param [[buffer(0)]],
-    const device P *weights [[buffer(1)]],
-    const device VECTOR(P, 4) *biase [[buffer(2)]],
+                                                          texture2d_array<P, access::write> outTexture [[texture(1)]],
+                                                          constant MetalConvParam &param [[buffer(0)]],
+                                                          const device P *weights [[buffer(1)]],
+                                                          const device VECTOR(P, 4) *biase [[buffer(2)]],
 #ifdef PRELU_CHANNEL
-    const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                          const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_ELEMENT
-    const device VECTOR(P, 4) *alpha [[buffer(3)]],
+                                                          const device VECTOR(P, 4) *alpha [[buffer(3)]],
 #endif
 #ifdef PRELU_OTHER
-    const device P *alpha [[buffer(3)]],
+                                                          const device P *alpha [[buffer(3)]],
 #endif
-    uint3 gid [[thread_position_in_grid]]) {
-
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  VECTOR(P, 4) output = biase[gid.z];
-  VECTOR(P, 4) inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    VECTOR(P, 4) input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-  
+                                                          uint3 gid [[thread_position_in_grid]]) {
+    // Depthwise 3x3 convolution + bias + PReLU: output slice gid.z is computed from input slice gid.z only. Out-of-range threads exit.
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z; // the same slice index is used to read the input texture
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); // centre tap in the input
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); // pixel coords, nearest, clamp_to_zero addressing
+    const uint kernelHXW = 9; // taps in the 3x3 window
+    uint weithTo = gid.z * kernelHXW * 4; // first scalar weight of this slice: 4 channels x 9 taps
+    VECTOR(P, 4) output = biase[gid.z]; // the accumulator starts from the bias of this slice
+    VECTOR(P, 4) inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice); // neighbours use fixed +/-1 offsets; param dilation is not read in this kernel
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {
+        VECTOR(P, 4) input = inputs[j];
+        output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; // each channel is multiplied only by its own 9 scalar weights
+        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+    }
+    // PReLU: components that are not positive are multiplied by alpha; the PRELU_* macro selects how alpha is looked up.
 #ifdef PRELU_CHANNEL
-  VECTOR(P, 4) alpha_value = alpha[gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    VECTOR(P, 4) alpha_value = alpha[gid.z]; // one alpha vector per output slice
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_ELEMENT
-  int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
-  VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); // alpha index of slice 0 at pixel (gid.x, gid.y)
+    VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_OTHER
-  P alpha_value = alpha[0];
-  output.x = output.x > 0 ? output.x : (alpha_value * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value * output.w);
+    P alpha_value = alpha[0]; // a single scalar alpha applied to all four components
+    output.x = output.x > 0 ? output.x : (alpha_value * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value * output.w);
 #endif
-  outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
+    outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
 }
 
 #endif
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPreluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPreluKernel.metal
index f03a1d5b625cf01f1f1bc5ac23bebf7dabd968d9..407b8385b7a7b822df9151905f167c930c8670a9 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPreluKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPreluKernel.metal
@@ -18,45 +18,45 @@ using namespace metal;
 
 #define P float
 
-  #define PRELU_CHANNEL prelu_channel
-  #define PRELU_TYPE prelu_channel
-    #include "ConvAddPrelu.inc.metal"
-  #undef  PRELU_TYPE
-  #undef  PRELU_CHANNEL
+#define PRELU_CHANNEL prelu_channel
+#define PRELU_TYPE prelu_channel
+#include "ConvAddPrelu.inc.metal" // included with P = float and PRELU_CHANNEL defined
+#undef  PRELU_TYPE
+#undef  PRELU_CHANNEL
 
-  #define PRELU_ELEMENT prelu_element
-  #define PRELU_TYPE prelu_element
-    #include "ConvAddPrelu.inc.metal"
-  #undef  PRELU_TYPE
-  #undef  PRELU_ELEMENT
+#define PRELU_ELEMENT prelu_element
+#define PRELU_TYPE prelu_element
+#include "ConvAddPrelu.inc.metal" // included with P = float and PRELU_ELEMENT defined
+#undef  PRELU_TYPE
+#undef  PRELU_ELEMENT
 
-  #define PRELU_OTHER   prelu_other
-  #define PRELU_TYPE prelu_other
-    #include "ConvAddPrelu.inc.metal"
-  #undef  PRELU_TYPE
-  #undef  PRELU_OTHER
+#define PRELU_OTHER   prelu_other
+#define PRELU_TYPE prelu_other
+#include "ConvAddPrelu.inc.metal" // included with P = float and PRELU_OTHER defined
+#undef  PRELU_TYPE
+#undef  PRELU_OTHER
 
 #undef P
 
 #define P half
 
-  #define PRELU_CHANNEL prelu_channel
-  #define PRELU_TYPE prelu_channel
-    #include "ConvAddPrelu.inc.metal"
-  #undef  PRELU_TYPE
-  #undef  PRELU_CHANNEL
+#define PRELU_CHANNEL prelu_channel
+#define PRELU_TYPE prelu_channel
+#include "ConvAddPrelu.inc.metal" // included with P = half and PRELU_CHANNEL defined
+#undef  PRELU_TYPE
+#undef  PRELU_CHANNEL
 
-  #define PRELU_ELEMENT prelu_element
-  #define PRELU_TYPE prelu_element
-    #include "ConvAddPrelu.inc.metal"
-  #undef  PRELU_TYPE
-  #undef  PRELU_ELEMENT
+#define PRELU_ELEMENT prelu_element
+#define PRELU_TYPE prelu_element
+#include "ConvAddPrelu.inc.metal" // included with P = half and PRELU_ELEMENT defined
+#undef  PRELU_TYPE
+#undef  PRELU_ELEMENT
 
-  #define PRELU_OTHER   prelu_other
-  #define PRELU_TYPE prelu_other
-    #include "ConvAddPrelu.inc.metal"
-  #undef  PRELU_TYPE
-  #undef  PRELU_OTHER
+#define PRELU_OTHER   prelu_other
+#define PRELU_TYPE prelu_other
+#include "ConvAddPrelu.inc.metal" // included with P = half and PRELU_OTHER defined
+#undef  PRELU_TYPE
+#undef  PRELU_OTHER
 
 #undef P
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvBNReluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvBNReluKernel.metal
index 4b97b7829a1fba27704fe7b60a03b2672f4f5953..6851f8aa98f49c405645e55d176ef921d2a1c0d2 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvBNReluKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvBNReluKernel.metal
@@ -25,41 +25,41 @@ kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTe
                                      const device float4 *new_scale [[buffer(2)]],
                                      const device float4 *new_biase [[buffer(3)]],
                                      uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  float4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
     
-    float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
     
-    float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
     
-    float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-  output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
-  outTexture.write(output, gid.xy, gid.z);
+    float4 output = float4(0.0);
+    
+    float4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+        output.x += dot(input, weight_x);
+        
+        float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+        output.y += dot(input, weight_y);
+        
+        float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+        output.z += dot(input, weight_z);
+        
+        float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(input, weight_w);
+    }
+    output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -69,50 +69,50 @@ kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTe
                                      const device float4 *new_scale [[buffer(2)]],
                                      const device float4 *new_biase [[buffer(3)]],
                                      uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  float4 input[9];
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
-    for (int j = 0; j < 9; ++j) {
-      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
     }
-  }
-  output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
-  outTexture.write(output, gid.xy, gid.z);
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = float4(0.0);
+    
+    float4 input[9];
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+        for (int j = 0; j < 9; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
+    }
+    output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -122,176 +122,176 @@ kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array<float, access::sa
                                                const device float4 *new_scale [[buffer(2)]],
                                                const device float4 *new_biase [[buffer(3)]],
                                                uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = float4(0.0);
-  float4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    float4 input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-  output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
-  outTexture.write(output, gid.xy, gid.z);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z;
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint weithTo = gid.z * kernelHXW * 4;
+    float4 output = float4(0.0);
+    float4 inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {
+        float4 input = inputs[j];
+        output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+    }
+    output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 #pragma mark - half
 kernel void conv_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                     texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                     constant MetalConvParam &param [[buffer(0)]],
-                                     const device half4 *weights [[buffer(1)]],
-                                     const device half4 *new_scale [[buffer(2)]],
-                                     const device half4 *new_biase [[buffer(3)]],
-                                     uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  half4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(float4(input), float4(weight_x));
+                                          texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                          constant MetalConvParam &param [[buffer(0)]],
+                                          const device half4 *weights [[buffer(1)]],
+                                          const device half4 *new_scale [[buffer(2)]],
+                                          const device half4 *new_biase [[buffer(3)]],
+                                          uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
     
-    half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(float4(input), float4(weight_y));
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
     
-    half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(float4(input), float4(weight_z));
+    float4 output = float4(0.0);
     
-    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(float4(input), float4(weight_w));
-  }
-  output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
-  outTexture.write(half4(output), gid.xy, gid.z);
+    half4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+        output.x += dot(float4(input), float4(weight_x));
+        
+        half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+        output.y += dot(float4(input), float4(weight_y));
+        
+        half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+        output.z += dot(float4(input), float4(weight_z));
+        
+        half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(float4(input), float4(weight_w));
+    }
+    output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
 
 kernel void conv_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                     texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                     constant MetalConvParam &param [[buffer(0)]],
-                                     const device half4 *weights [[buffer(1)]],
-                                     const device half4 *new_scale [[buffer(2)]],
-                                     const device half4 *new_biase [[buffer(3)]],
-                                     uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  half4 input[9];
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
-    for (int j = 0; j < 9; ++j) {
-      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(float4(input[j]), float4(weight_x));
-      
-      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(float4(input[j]), float4(weight_y));
-      
-      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(float4(input[j]), float4(weight_z));
-      
-      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(float4(input[j]), float4(weight_w));
+                                          texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                          constant MetalConvParam &param [[buffer(0)]],
+                                          const device half4 *weights [[buffer(1)]],
+                                          const device half4 *new_scale [[buffer(2)]],
+                                          const device half4 *new_biase [[buffer(3)]],
+                                          uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
     }
-  }
-  output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
-  outTexture.write(half4(output), gid.xy, gid.z);
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = float4(0.0);
+    
+    half4 input[9];
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+        for (int j = 0; j < 9; ++j) {
+            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(float4(input[j]), float4(weight_x));
+            
+            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(float4(input[j]), float4(weight_y));
+            
+            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(float4(input[j]), float4(weight_z));
+            
+            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(float4(input[j]), float4(weight_w));
+        }
+    }
+    output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
 
 kernel void depthwise_conv_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                               texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                               constant MetalConvParam &param [[buffer(0)]],
-                                               const device half *weights [[buffer(1)]],
-                                               const device half4 *new_scale [[buffer(2)]],
-                                               const device half4 *new_biase [[buffer(3)]],
-                                               uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = float4(0.0);
-  half4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    half4 input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-  output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
-  outTexture.write(half4(output), gid.xy, gid.z);
+                                                    texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                                    constant MetalConvParam &param [[buffer(0)]],
+                                                    const device half *weights [[buffer(1)]],
+                                                    const device half4 *new_scale [[buffer(2)]],
+                                                    const device half4 *new_biase [[buffer(3)]],
+                                                    uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z;
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint weithTo = gid.z * kernelHXW * 4;
+    float4 output = float4(0.0);
+    half4 inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {
+        half4 input = inputs[j];
+        output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+    }
+    output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal
index c07515c13da54c7f8bf698f976e47f7cda6de32b..c7b3f792d69033eb608e55ec747bb086e501040b 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal
@@ -23,49 +23,49 @@ kernel void conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(
                      constant MetalConvParam &param [[buffer(0)]],
                      const device float4 *weights [[buffer(1)]],
                      uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  float4 input[9];
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
-    for (int j = 0; j < 9; ++j) {
-      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(input[j], weight_x);
-      
-      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(input[j], weight_y);
-      
-      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(input[j], weight_z);
-      
-      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(input[j], weight_w);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = float4(0.0);
+    
+    float4 input[9];
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+        for (int j = 0; j < 9; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+            
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+            
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+            
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
     }
-  }
-  outTexture.write(output, gid.xy, gid.z);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void depthwise_conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -73,37 +73,37 @@ kernel void depthwise_conv_3x3(texture2d_array<float, access::sample> inTexture
                                constant MetalConvParam &param [[buffer(0)]],
                                const device float *weights [[buffer(1)]],
                                uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = float4(0.0);
-  float4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    float4 input = inputs[j];
-    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-  }
-  outTexture.write(output, gid.xy, gid.z);
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z;
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint weithTo = gid.z * kernelHXW * 4;
+    float4 output = float4(0.0);
+    float4 inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {
+        float4 input = inputs[j];
+        output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+    }
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -111,170 +111,170 @@ kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(
                      constant MetalConvParam &param [[buffer(0)]],
                      const device float4 *weights [[buffer(1)]],
                      uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  float4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(input, weight_x);
     
-    float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(input, weight_y);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
     
-    float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(input, weight_z);
+    float4 output = float4(0.0);
     
-    float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(input, weight_w);
-  }
-  outTexture.write(output, gid.xy, gid.z);
+    float4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+        output.x += dot(input, weight_x);
+        
+        float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+        output.y += dot(input, weight_y);
+        
+        float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+        output.z += dot(input, weight_z);
+        
+        float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(input, weight_w);
+    }
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 
 kernel void conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                     texture2d_array<half, access::write> outTexture [[texture(1)]],
-                     constant MetalConvParam &param [[buffer(0)]],
-                     const device half4 *weights [[buffer(1)]],
-                     uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  half4 input[9];
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
-    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
-    for (int j = 0; j < 9; ++j) {
-      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.x += dot(float4(input[j]), float4(weight_x));
-      
-      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.y += dot(float4(input[j]), float4(weight_y));
-      
-      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.z += dot(float4(input[j]), float4(weight_z));
-      
-      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(float4(input[j]), float4(weight_w));
+                          texture2d_array<half, access::write> outTexture [[texture(1)]],
+                          constant MetalConvParam &param [[buffer(0)]],
+                          const device half4 *weights [[buffer(1)]],
+                          uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = float4(0.0);
+    
+    half4 input[9];
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+        input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+        for (int j = 0; j < 9; ++j) {
+            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(float4(input[j]), float4(weight_x));
+            
+            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(float4(input[j]), float4(weight_y));
+            
+            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(float4(input[j]), float4(weight_z));
+            
+            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(float4(input[j]), float4(weight_w));
+        }
     }
-  }
-  outTexture.write(half4(output), gid.xy, gid.z);
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
 
 kernel void depthwise_conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                               texture2d_array<half, access::write> outTexture [[texture(1)]],
-                               constant MetalConvParam &param [[buffer(0)]],
-                               const device half *weights [[buffer(1)]],
-                               uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  uint output_slice = gid.z;
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 9;
-  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = float4(0.0);
-  half4 inputs[9];
-  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-  for (int j = 0; j < 9; ++j) {
-    half4 input = inputs[j];
-    output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]);
-    output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]);
-    output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
-    output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
-  }
-  outTexture.write(half4(output), gid.xy, gid.z);
+                                    texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                    constant MetalConvParam &param [[buffer(0)]],
+                                    const device half *weights [[buffer(1)]],
+                                    uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z;
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint weithTo = gid.z * kernelHXW * 4;
+    float4 output = float4(0.0);
+    half4 inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+    for (int j = 0; j < 9; ++j) {
+        half4 input = inputs[j];
+        output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]);
+        output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]);
+        output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
+        output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
+    }
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
 
 kernel void conv_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                     texture2d_array<half, access::write> outTexture [[texture(1)]],
-                     constant MetalConvParam &param [[buffer(0)]],
-                     const device half4 *weights [[buffer(1)]],
-                     uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  ushort2 stride = ushort2(param.strideX, param.strideY);
-  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint kernelHXW = 1;
-  
-  uint input_arr_size = inTexture.get_array_size();
-  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  
-  float4 output = float4(0.0);
-  
-  half4 input;
-  for (uint i = 0; i < input_arr_size; ++i) {
-    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-    half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-    output.x += dot(float4(input), float4(weight_x));
+                          texture2d_array<half, access::write> outTexture [[texture(1)]],
+                          constant MetalConvParam &param [[buffer(0)]],
+                          const device half4 *weights [[buffer(1)]],
+                          uint3 gid [[thread_position_in_grid]]) {
     
-    half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-    output.y += dot(float4(input), float4(weight_y));
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
     
-    half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-    output.z += dot(float4(input), float4(weight_z));
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
     
-    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-    output.w += dot(float4(input), float4(weight_w));
-  }
-  outTexture.write(half4(output), gid.xy, gid.z);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
+    
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    
+    float4 output = float4(0.0);
+    
+    half4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+        output.x += dot(float4(input), float4(weight_x));
+        
+        half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+        output.y += dot(float4(input), float4(weight_y));
+        
+        half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+        output.z += dot(float4(input), float4(weight_z));
+        
+        half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(float4(input), float4(weight_w));
+    }
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
 
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvTransposeKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvTransposeKernel.metal
index baf3f31157a472412bb08ccb3c803f5ec9e25d9c..a324fac188051552c349dda76da644b39ff00dbf 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvTransposeKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvTransposeKernel.metal
@@ -16,17 +16,17 @@
 using namespace metal;
 
 struct MetalConvTransposeParam{
-  ushort kernelW;
-  ushort kernelH;
-  
-  ushort strideX;
-  ushort strideY;
-  
-  ushort paddingX;
-  ushort paddingY;
-  
-  ushort dilationX;
-  ushort dilationY;
+    ushort kernelW;
+    ushort kernelH;
+    
+    ushort strideX;
+    ushort strideY;
+    
+    ushort paddingX;
+    ushort paddingY;
+    
+    ushort dilationX;
+    ushort dilationY;
 };
 
 kernel void conv_transpose2x2_stride2(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -34,83 +34,83 @@ kernel void conv_transpose2x2_stride2(texture2d_array<float, access::sample> inT
                                       constant MetalConvTransposeParam &param [[buffer(0)]],
                                       const device float4 *weights [[buffer(1)]],
                                       uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  int input_array_size = inTexture.get_array_size();
-  int kernel_index_x = gid.x % 2;
-  int kernel_index_y = gid.y % 2;
-  int kernel_index = kernel_index_y * 2 + kernel_index_x;
-  int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size);
-  int input_x = gid.x / 2;
-  int input_y = gid.y / 2;
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  float4 output = float4(0.0);
-  for (int i = 0; i < input_array_size; ++i) {
-    
-    float4 input = inTexture.sample(sample, float2(input_x, input_y), i);
-    
-    float4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i];
-    float4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i];
-    float4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i];
-    float4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i];
-    
-    output.x += dot(input, kernel_slice0);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
     
-    output.y += dot(input, kernel_slice1);
+    int input_array_size = inTexture.get_array_size();
+    int kernel_index_x = gid.x % 2;
+    int kernel_index_y = gid.y % 2;
+    int kernel_index = kernel_index_y * 2 + kernel_index_x;
+    int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size);
+    int input_x = gid.x / 2;
+    int input_y = gid.y / 2;
     
-    output.z += dot(input, kernel_slice2);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    float4 output = float4(0.0);
+    for (int i = 0; i < input_array_size; ++i) {
+        
+        float4 input = inTexture.sample(sample, float2(input_x, input_y), i);
+        
+        float4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i];
+        float4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i];
+        float4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i];
+        float4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i];
+        
+        output.x += dot(input, kernel_slice0);
+        
+        output.y += dot(input, kernel_slice1);
+        
+        output.z += dot(input, kernel_slice2);
+        
+        output.w += dot(input, kernel_slice3);
+    }
     
-    output.w += dot(input, kernel_slice3);
-  }
-  
-  outTexture.write(output, gid.xy, gid.z);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void conv_transpose2x2_stride2_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                      texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                      constant MetalConvTransposeParam &param [[buffer(0)]],
-                                      const device half4 *weights [[buffer(1)]],
-                                      uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  int input_array_size = inTexture.get_array_size();
-  int kernel_index_x = gid.x % 2;
-  int kernel_index_y = gid.y % 2;
-  int kernel_index = kernel_index_y * 2 + kernel_index_x;
-  int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size);
-  int input_x = gid.x / 2;
-  int input_y = gid.y / 2;
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  float4 output = float4(0.0);
-  for (int i = 0; i < input_array_size; ++i) {
-    
-    half4 input = inTexture.sample(sample, float2(input_x, input_y), i);
-    
-    half4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i];
-    half4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i];
-    half4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i];
-    half4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i];
-    
-    output.x += dot(float4(input), float4(kernel_slice0));
+                                           texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                           constant MetalConvTransposeParam &param [[buffer(0)]],
+                                           const device half4 *weights [[buffer(1)]],
+                                           uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
     
-    output.y += dot(float4(input), float4(kernel_slice1));
+    int input_array_size = inTexture.get_array_size();
+    int kernel_index_x = gid.x % 2;
+    int kernel_index_y = gid.y % 2;
+    int kernel_index = kernel_index_y * 2 + kernel_index_x;
+    int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size);
+    int input_x = gid.x / 2;
+    int input_y = gid.y / 2;
     
-    output.z += dot(float4(input), float4(kernel_slice2));
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    float4 output = float4(0.0);
+    for (int i = 0; i < input_array_size; ++i) {
+        
+        half4 input = inTexture.sample(sample, float2(input_x, input_y), i);
+        
+        half4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i];
+        half4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i];
+        half4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i];
+        half4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i];
+        
+        output.x += dot(float4(input), float4(kernel_slice0));
+        
+        output.y += dot(float4(input), float4(kernel_slice1));
+        
+        output.z += dot(float4(input), float4(kernel_slice2));
+        
+        output.w += dot(float4(input), float4(kernel_slice3));
+    }
     
-    output.w += dot(float4(input), float4(kernel_slice3));
-  }
-  
-  outTexture.write(half4(output), gid.xy, gid.z);
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
 
 //kernel void conv_transpose(texture2d_array<float, access::sample> inTexture [[texture(0)]],
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal
index b152df828106acd96171a89f4f636f308e0e9e39..40cad28df130e2d826500cc840aaabf09d04e79b 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal
@@ -18,13 +18,13 @@
 using namespace metal;
 
 struct ElementwiseAddParam {
-  int32_t fast;
-  int32_t axis;
-  int32_t ylen;
-  int32_t xdim[4];
-  int32_t xtrans[4];
-  int32_t ydim[4];
-  int32_t ytrans[4];
+    int32_t fast;
+    int32_t axis;
+    int32_t ylen;
+    int32_t xdim[4];
+    int32_t xtrans[4];
+    int32_t ydim[4];
+    int32_t ytrans[4];
 };
 
 kernel void elementwise_add(texture2d_array<float, access::read> inputX [[texture(0)]],
@@ -32,69 +32,69 @@ kernel void elementwise_add(texture2d_array<float, access::read> inputX [[textur
                             texture2d_array<float, access::write> outTexture [[texture(2)]],
                             constant ElementwiseAddParam &pm [[buffer(0)]],
                             uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  float4 rx, ry;
-
-  if (pm.fast == 1) {
-    rx = inputX.read(gid.xy, gid.z);
-    ry = inputY.read(gid.xy, gid.z);
-  } else {
-    rx = inputX.read(gid.xy, gid.z);
-    int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
-    int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
-    int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
-    int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
-    int32_t yshift = 4 - pm.ylen - pm.axis;
-    for (int n = 0; n < 4; n++) {
-      x_xyzn[3] = n;
-      xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
-      invtrans(xtrans, x_abcd, t_abcd);
-      for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) {
-        y_abcd[yshift+k] = t_abcd[k];
-      }
-      trans(ytrans, y_abcd, t_abcd);
-      abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
-      ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    float4 rx, ry;
+    
+    if (pm.fast == 1) {
+        rx = inputX.read(gid.xy, gid.z);
+        ry = inputY.read(gid.xy, gid.z);
+    } else {
+        rx = inputX.read(gid.xy, gid.z);
+        int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
+        int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
+        int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
+        int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
+        int32_t yshift = 4 - pm.ylen - pm.axis;
+        for (int n = 0; n < 4; n++) {
+            x_xyzn[3] = n;
+            xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
+            invtrans(xtrans, x_abcd, t_abcd);
+            for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) {
+                y_abcd[yshift+k] = t_abcd[k];
+            }
+            trans(ytrans, y_abcd, t_abcd);
+            abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
+            ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
+        }
     }
-  }
-  float4 r = rx + ry;
-  outTexture.write(r, gid.xy, gid.z);
+    float4 r = rx + ry;
+    outTexture.write(r, gid.xy, gid.z);
 }
 
 kernel void elementwise_add_half(texture2d_array<half, access::read> inputX [[texture(0)]],
-                            texture2d_array<half, access::read> inputY [[texture(1)]],
-                            texture2d_array<half, access::write> outTexture [[texture(2)]],
-                            constant ElementwiseAddParam &pm [[buffer(0)]],
-                            uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  half4 rx, ry;
-
-  if (pm.fast == 1) {
-    rx = inputX.read(gid.xy, gid.z);
-    ry = inputY.read(gid.xy, gid.z);
-  } else {
-    rx = inputX.read(gid.xy, gid.z);
-    int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
-    int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
-    int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
-    int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
-    int32_t yshift = 4 - pm.ylen - pm.axis;
-    for (int n = 0; n < 4; n++) {
-      x_xyzn[3] = n;
-      xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
-      invtrans(xtrans, x_abcd, t_abcd);
-      for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) {
-        y_abcd[yshift+k] = t_abcd[k];
-      }
-      trans(ytrans, y_abcd, t_abcd);
-      abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
-      ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
+                                 texture2d_array<half, access::read> inputY [[texture(1)]],
+                                 texture2d_array<half, access::write> outTexture [[texture(2)]],
+                                 constant ElementwiseAddParam &pm [[buffer(0)]],
+                                 uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    half4 rx, ry;
+    
+    if (pm.fast == 1) {
+        rx = inputX.read(gid.xy, gid.z);
+        ry = inputY.read(gid.xy, gid.z);
+    } else {
+        rx = inputX.read(gid.xy, gid.z);
+        int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
+        int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
+        int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
+        int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
+        int32_t yshift = 4 - pm.ylen - pm.axis;
+        for (int n = 0; n < 4; n++) {
+            x_xyzn[3] = n;
+            xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
+            invtrans(xtrans, x_abcd, t_abcd);
+            for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) {
+                y_abcd[yshift+k] = t_abcd[k];
+            }
+            trans(ytrans, y_abcd, t_abcd);
+            abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
+            ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
+        }
     }
-  }
-  half4 r = rx + ry;
-  outTexture.write(r, gid.xy, gid.z);
+    half4 r = rx + ry;
+    outTexture.write(r, gid.xy, gid.z);
 }
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.inc.metal
index b1d68d680962c53778d624ab15bfcfeb1d1a3142..65566952efa5a30c8601e751cbfb0ac6ccf21464 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.inc.metal
@@ -20,72 +20,72 @@
 using namespace metal;
 
 kernel void FUNC3_(elementwise_add, PRELU_TYPE, P)(texture2d_array<P, access::read> inputX [[texture(0)]],
-                                 texture2d_array<P, access::read> inputY [[texture(1)]],
-                                 texture2d_array<P, access::write> outTexture [[texture(2)]],
-                                 constant ElementwiseAddParam &pm [[buffer(0)]],
+                                                   texture2d_array<P, access::read> inputY [[texture(1)]],
+                                                   texture2d_array<P, access::write> outTexture [[texture(2)]],
+                                                   constant ElementwiseAddParam &pm [[buffer(0)]],
 #ifdef PRELU_CHANNEL
-                                 const device VECTOR(P, 4) *alpha [[buffer(1)]],
+                                                   const device VECTOR(P, 4) *alpha [[buffer(1)]],
 #endif
 #ifdef PRELU_ELEMENT
-                                 const device VECTOR(P, 4) *alpha [[buffer(1)]],
+                                                   const device VECTOR(P, 4) *alpha [[buffer(1)]],
 #endif
 #ifdef PRELU_OTHER
-                                 const device P *alpha [[buffer(1)]],
+                                                   const device P *alpha [[buffer(1)]],
 #endif
-                                 uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  VECTOR(P, 4) rx, ry;
-  
-  if (pm.fast == 1) {
-    rx = inputX.read(gid.xy, gid.z);
-    ry = inputY.read(gid.xy, gid.z);
+                                                   uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    VECTOR(P, 4) rx, ry;
+    
+    if (pm.fast == 1) {
+        rx = inputX.read(gid.xy, gid.z);
+        ry = inputY.read(gid.xy, gid.z);
     } else {
-      rx = inputX.read(gid.xy, gid.z);
-      int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
-      int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
-      int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
-      int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
-      int32_t yshift = 4 - pm.ylen - pm.axis;
-      for (int n = 0; n < 4; n++) {
-        x_xyzn[3] = n;
-        xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
-        invtrans(xtrans, x_abcd, t_abcd);
-        for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) {
-          y_abcd[yshift+k] = t_abcd[k];
+        rx = inputX.read(gid.xy, gid.z);
+        int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
+        int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
+        int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
+        int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
+        int32_t yshift = 4 - pm.ylen - pm.axis;
+        for (int n = 0; n < 4; n++) {
+            x_xyzn[3] = n;
+            xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
+            invtrans(xtrans, x_abcd, t_abcd);
+            for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) {
+                y_abcd[yshift+k] = t_abcd[k];
+            }
+            trans(ytrans, y_abcd, t_abcd);
+            abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
+            ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
         }
-        trans(ytrans, y_abcd, t_abcd);
-        abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
-        ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
-      }
-  }
-  VECTOR(P, 4) output = rx + ry;
-  
+    }
+    VECTOR(P, 4) output = rx + ry;
+    
 #ifdef PRELU_CHANNEL
-  VECTOR(P, 4) alpha_value = alpha[gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    VECTOR(P, 4) alpha_value = alpha[gid.z];
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_ELEMENT
-  int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
-  VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
-  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+    int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
+    VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
+    output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
 #endif
 #ifdef PRELU_OTHER
-  P alpha_value = alpha[0];
-  output.x = output.x > 0 ? output.x : (alpha_value * output.x);
-  output.y = output.y > 0 ? output.y : (alpha_value * output.y);
-  output.z = output.z > 0 ? output.z : (alpha_value * output.z);
-  output.w = output.w > 0 ? output.w : (alpha_value * output.w);
+    P alpha_value = alpha[0];
+    output.x = output.x > 0 ? output.x : (alpha_value * output.x);
+    output.y = output.y > 0 ? output.y : (alpha_value * output.y);
+    output.z = output.z > 0 ? output.z : (alpha_value * output.z);
+    output.w = output.w > 0 ? output.w : (alpha_value * output.w);
 #endif
-  
-  outTexture.write(output, gid.xy, gid.z);
+    
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 #endif
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal
index 8fd1a9fdab8c86fbc52f6dab9c448b7b0f27d403..cca11e80861723668eea05169c060cb7fcc455c2 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal
@@ -17,13 +17,13 @@
 using namespace metal;
 
 struct ElementwiseAddParam {
-  int32_t fast;
-  int32_t axis;
-  int32_t ylen;
-  int32_t xdim[4];
-  int32_t xtrans[4];
-  int32_t ydim[4];
-  int32_t ytrans[4];
+    int32_t fast;
+    int32_t axis;
+    int32_t ylen;
+    int32_t xdim[4];
+    int32_t xtrans[4];
+    int32_t ydim[4];
+    int32_t ytrans[4];
 };
 
 #define P float
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal
index 9655b0fc1a02912cf64b29457a384d95231a25be..114aa1566441163e52f03d201fb848d8185ea75a 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal
@@ -23,38 +23,38 @@
 #define VECTOR(p, n) CONCAT2(p, n)
 
 kernel void FUNC_T(fetch, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
-                  device float *output [[buffer(0)]],
-                  uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= inTexture.get_width() ||
-      gid.y >= inTexture.get_height() ||
-      gid.z >= inTexture.get_array_size()) {
-    return;
-  }
-
-  int input_width = inTexture.get_width();
-  int input_height = inTexture.get_height();
-  const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z);
-  int output_to = 4 * input_width * input_height;
-  
-  output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
-  
-  output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y;
-  output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
-  output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
+                             device float *output [[buffer(0)]],
+                             uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= inTexture.get_width() ||
+        gid.y >= inTexture.get_height() ||
+        gid.z >= inTexture.get_array_size()) {
+        return;
+    }
+    
+    int input_width = inTexture.get_width();
+    int input_height = inTexture.get_height();
+    const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z);
+    int output_to = 4 * input_width * input_height;
+    
+    output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
+    
+    output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y;
+    output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
+    output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
 }
 
 kernel void FUNC(fetch, 1or2, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
-                             device float4 *output [[buffer(0)]],
-                             uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= inTexture.get_width() ||
-      gid.y >= inTexture.get_height() ||
-      gid.z >= inTexture.get_array_size()) {
-    return;
-  }
-  
-  int input_width = inTexture.get_width();
-  const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z);
-  output[gid.y * input_width + gid.x] = float4(input);
+                                 device float4 *output [[buffer(0)]],
+                                 uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= inTexture.get_width() ||
+        gid.y >= inTexture.get_height() ||
+        gid.z >= inTexture.get_array_size()) {
+        return;
+    }
+    
+    int input_width = inTexture.get_width();
+    const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z);
+    output[gid.y * input_width + gid.x] = float4(input);
 }
 
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.metal
index 87d304302fe4dbf246ecfed2da1af8172ff717ac..df2de98648ef9ec5cfb8eaf5cc46887aadc04e98 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.metal
@@ -31,7 +31,7 @@ using namespace metal;
 kernel void fetch_placeholder(texture2d_array<float, access::read> inTexture [[texture(0)]],
                               device float *output [[buffer(0)]],
                               uint3 gid [[thread_position_in_grid]]) {
-  
+    
 }
 
 kernel void fetch_placeholder_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Kernels.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Kernels.metal
index 368509f001aca6361b81b9b7839cf24b2efc5c12..06bf42697efa18b7100711301ea492447d3c14ce 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Kernels.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Kernels.metal
@@ -23,47 +23,47 @@ kernel void place_holder(texture2d<half, access::read> inTexture [[texture(0)]],
 }
 
 struct OutputDim {
-  ushort width;
-  ushort height;
-  ushort strideX;
-  ushort strideY;
+    ushort width;
+    ushort height;
+    ushort strideX;
+    ushort strideY;
 };
 
 kernel void resize(texture2d<half, access::read> inTexture [[texture(0)]],
                    texture2d_array<half, access::write> outTexture [[texture(1)]],
                    constant OutputDim &params [[buffer(0)]],
                    uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  
-  constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const uint2 pos = gid.xy * uint2(params.strideX, params.strideY);
-  const half4 input = inTexture.read(pos);
-  outTexture.write(half4(input.x, input.y, input.z, input.w), gid.xy, gid.z);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    
+    constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint2 pos = gid.xy * uint2(params.strideX, params.strideY);
+    const half4 input = inTexture.read(pos);
+    outTexture.write(half4(input.x, input.y, input.z, input.w), gid.xy, gid.z);
 }
 
 
 kernel void texture2d_to_2d_array(texture2d<float, access::read> inTexture [[texture(0)]],
                                   texture2d_array<float, access::write> outTexture [[texture(1)]],
                                   uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= inTexture.get_width() ||
-      gid.y >= inTexture.get_height()){
-    return;
-  }
-  const float4 input = inTexture.read(gid.xy);
-  outTexture.write(input, gid.xy, 0);
+    if (gid.x >= inTexture.get_width() ||
+        gid.y >= inTexture.get_height()){
+        return;
+    }
+    const float4 input = inTexture.read(gid.xy);
+    outTexture.write(input, gid.xy, 0);
 }
 
 kernel void texture2d_to_2d_array_half(texture2d<half, access::read> inTexture [[texture(0)]],
-                                      texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                      uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= inTexture.get_width() ||
-      gid.y >= inTexture.get_height()){
-    return;
-  }
-  const half4 input = inTexture.read(gid.xy);
-  outTexture.write(input, gid.xy, 0);
+                                       texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                       uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= inTexture.get_width() ||
+        gid.y >= inTexture.get_height()){
+        return;
+    }
+    const half4 input = inTexture.read(gid.xy);
+    outTexture.write(input, gid.xy, 0);
 }
 
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/NMSFetchResultKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/NMSFetchResultKernel.metal
index 44c57440e1ec138717ad1bc569fd772e0d7ede1a..e32c98cc29f964be4089699bbb035f059f32d0dd 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/NMSFetchResultKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/NMSFetchResultKernel.metal
@@ -16,65 +16,65 @@
 using namespace metal;
 
 kernel void nms_fetch_result(texture2d_array<float, access::read> inTexture [[texture(0)]],
-    device float *output [[buffer(0)]],
-    uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= inTexture.get_width() ||
-      gid.y >= inTexture.get_height() ||
-      gid.z >= inTexture.get_array_size()) {
-    return;
-  }
-  
-  int input_width = inTexture.get_width();
-  const float4 input = inTexture.read(gid.xy, gid.z);
-  output[gid.y * input_width + gid.x] = input.x;
-  
+                             device float *output [[buffer(0)]],
+                             uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= inTexture.get_width() ||
+        gid.y >= inTexture.get_height() ||
+        gid.z >= inTexture.get_array_size()) {
+        return;
+    }
+    
+    int input_width = inTexture.get_width();
+    const float4 input = inTexture.read(gid.xy, gid.z);
+    output[gid.y * input_width + gid.x] = input.x;
+    
 }
 
 
 kernel void nms_fetch_result_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
-                             device float *output [[buffer(0)]],
-                             uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= inTexture.get_width() ||
-      gid.y >= inTexture.get_height() ||
-      gid.z >= inTexture.get_array_size()) {
-    return;
-  }
-  
-  int input_width = inTexture.get_width();
-  const half4 input = inTexture.read(gid.xy, gid.z);
-  output[gid.y * input_width + gid.x] = input.x;
+                                  device float *output [[buffer(0)]],
+                                  uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= inTexture.get_width() ||
+        gid.y >= inTexture.get_height() ||
+        gid.z >= inTexture.get_array_size()) {
+        return;
+    }
+    
+    int input_width = inTexture.get_width();
+    const half4 input = inTexture.read(gid.xy, gid.z);
+    output[gid.y * input_width + gid.x] = input.x;
 }
 
 kernel void nms_fetch_bbox(texture2d_array<float, access::read> inTexture [[texture(0)]],
-    device float4 *output [[buffer(0)]],
-    uint3 gid [[thread_position_in_grid]]) {
-  
-  if (gid.x >= inTexture.get_width() ||
-      gid.y >= inTexture.get_height() ||
-      gid.z >= inTexture.get_array_size()) {
-    return;
-  }
-  
-  int input_width = inTexture.get_width();
-//  int input_height = inTexture.get_height();
-  const float4 input = inTexture.read(gid.xy, gid.z);
-  output[gid.y * input_width + gid.x] = input;
+                           device float4 *output [[buffer(0)]],
+                           uint3 gid [[thread_position_in_grid]]) {
+    
+    if (gid.x >= inTexture.get_width() ||
+        gid.y >= inTexture.get_height() ||
+        gid.z >= inTexture.get_array_size()) {
+        return;
+    }
+    
+    int input_width = inTexture.get_width();
+    //  int input_height = inTexture.get_height();
+    const float4 input = inTexture.read(gid.xy, gid.z);
+    output[gid.y * input_width + gid.x] = input;
 }
 
 kernel void nms_fetch_bbox_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
-                           device float4 *output [[buffer(0)]],
-                           uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= inTexture.get_width() ||
-      gid.y >= inTexture.get_height() ||
-      gid.z >= inTexture.get_array_size()) {
-    return;
-  }
-  
-  int input_width = inTexture.get_width();
-//  int input_height = inTexture.get_height();
-  const half4 input = inTexture.read(gid.xy, gid.z);
-  output[gid.y * input_width + gid.x] = float4(input);
+                                device float4 *output [[buffer(0)]],
+                                uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= inTexture.get_width() ||
+        gid.y >= inTexture.get_height() ||
+        gid.z >= inTexture.get_array_size()) {
+        return;
+    }
+    
+    int input_width = inTexture.get_width();
+    //  int input_height = inTexture.get_height();
+    const half4 input = inTexture.read(gid.xy, gid.z);
+    output[gid.y * input_width + gid.x] = float4(input);
 }
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.inc.metal
index 3c36ba06f543f6f6cf3e1c234c5326e1f00fdc04..05146b8d14ea9f637ab7fc381f9911c1ad129ad2 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.inc.metal
@@ -15,36 +15,36 @@
 #ifdef P
 
 kernel void FUNC2_(pool, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
-                 texture2d_array<P, access::write> outTexture [[texture(1)]],
-                 constant PoolParam &pm [[buffer(0)]],
-                 uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  int xmin = gid.x * pm.strideX - pm.paddingX;
-  int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
-  xmin = max(xmin, 0);
-  int ymin = gid.y * pm.strideX - pm.paddingX;
-  int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height()));
-  ymin = max(ymin, 0);
-  
-  VECTOR(P, 4) r = 0;
-  if (pm.poolType == 0) {
-    r = inTexture.read(uint2(xmin, ymin), gid.z);
-    for (int x = xmin; x < xmax; x++) {
-      for (int y = ymin; y < ymax; y++) {
-        r = fmax(r, inTexture.read(uint2(x, y), gid.z));
-      }
+                            texture2d_array<P, access::write> outTexture [[texture(1)]],
+                            constant PoolParam &pm [[buffer(0)]],
+                            uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    int xmin = gid.x * pm.strideX - pm.paddingX;
+    int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
+    xmin = max(xmin, 0);
+    int ymin = gid.y * pm.strideX - pm.paddingX;
+    int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height()));
+    ymin = max(ymin, 0);
+    
+    VECTOR(P, 4) r = 0;
+    if (pm.poolType == 0) {
+        r = inTexture.read(uint2(xmin, ymin), gid.z);
+        for (int x = xmin; x < xmax; x++) {
+            for (int y = ymin; y < ymax; y++) {
+                r = fmax(r, inTexture.read(uint2(x, y), gid.z));
+            }
+        }
+    } else if (pm.poolType == 1) {
+        for (int x = xmin; x < xmax; x++) {
+            for (int y = ymin; y < ymax; y++) {
+                r += inTexture.read(uint2(x, y), gid.z);
+            }
+        }
+        r /= (xmax - xmin) * (ymax - ymin);
     }
-  } else if (pm.poolType == 1) {
-    for (int x = xmin; x < xmax; x++) {
-      for (int y = ymin; y < ymax; y++) {
-        r += inTexture.read(uint2(x, y), gid.z);
-      }
-    }
-    r /= (xmax - xmin) * (ymax - ymin);
-  }
-  outTexture.write(r, gid.xy, gid.z);
+    outTexture.write(r, gid.xy, gid.z);
 }
 
 #endif
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.metal
index e76b4ac74200ff9e61c888166f46deda7b071f2c..30111b7bcb24e6c5eceecfbbcd65430404333a1c 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.metal
@@ -18,13 +18,13 @@
 using namespace metal;
 
 struct PoolParam {
-  int ksizeX;
-  int ksizeY;
-  int strideX;
-  int strideY;
-  int paddingX;
-  int paddingY;
-  int poolType;
+    int ksizeX;
+    int ksizeY;
+    int strideX;
+    int strideY;
+    int paddingX;
+    int paddingY;
+    int poolType;
 };
 
 #define P half
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PreluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PreluKernel.metal
index 597804137743dd253d05d91a5008f558dcaf42e7..6279821436804d8d3459899b986f01d326e35df0 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PreluKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PreluKernel.metal
@@ -16,136 +16,136 @@
 using namespace metal;
 
 kernel void prelu_channel(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                           texture2d_array<float, access::write> outTexture [[texture(1)]],
-                           const device float4 *alpha [[buffer(0)]],
-                           uint3 gid [[thread_position_in_grid]]){
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
-  float4 alpha_value = alpha[gid.z];
-  float4 output;
-  output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
-  output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
-  output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
-  output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
-  outTexture.write(output, gid.xy, gid.z);
+                          texture2d_array<float, access::write> outTexture [[texture(1)]],
+                          const device float4 *alpha [[buffer(0)]],
+                          uint3 gid [[thread_position_in_grid]]){
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+    float4 alpha_value = alpha[gid.z];
+    float4 output;
+    output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
+    output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
+    output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
+    output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void prelu_element(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                           texture2d_array<float, access::write> outTexture [[texture(1)]],
                           const device float4 *alpha [[buffer(0)]],
                           uint3 gid [[thread_position_in_grid]]){
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
-
-  int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size();
-  float4 alpha_value = alpha[alpha_to + gid.z];
-
-  float4 output;
-  output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
-  output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
-  output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
-  output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
-  outTexture.write(output, gid.xy, gid.z);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+    
+    int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size();
+    float4 alpha_value = alpha[alpha_to + gid.z];
+    
+    float4 output;
+    output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
+    output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
+    output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
+    output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void prelu_other(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                          texture2d_array<float, access::write> outTexture [[texture(1)]],
-                          const device float *alpha [[buffer(0)]],
-                          uint3 gid [[thread_position_in_grid]]){
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
-  float alpha_value = alpha[0];
-  float4 output;
-  output.x = input.x > 0 ? input.x : (alpha_value * input.x);
-  output.y = input.y > 0 ? input.y : (alpha_value * input.y);
-  output.z = input.z > 0 ? input.z : (alpha_value * input.z);
-  output.w = input.w > 0 ? input.w : (alpha_value * input.w);
-  outTexture.write(output, gid.xy, gid.z);
+                        texture2d_array<float, access::write> outTexture [[texture(1)]],
+                        const device float *alpha [[buffer(0)]],
+                        uint3 gid [[thread_position_in_grid]]){
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+    float alpha_value = alpha[0];
+    float4 output;
+    output.x = input.x > 0 ? input.x : (alpha_value * input.x);
+    output.y = input.y > 0 ? input.y : (alpha_value * input.y);
+    output.z = input.z > 0 ? input.z : (alpha_value * input.z);
+    output.w = input.w > 0 ? input.w : (alpha_value * input.w);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 
 kernel void prelu_channel_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                          texture2d_array<half, access::write> outTexture [[texture(1)]],
-                          const device half4 *alpha [[buffer(0)]],
-                          uint3 gid [[thread_position_in_grid]]){
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
-  half4 alpha_value = alpha[gid.z];
-  half4 output;
-  output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
-  output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
-  output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
-  output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
-  outTexture.write(output, gid.xy, gid.z);
+                               texture2d_array<half, access::write> outTexture [[texture(1)]],
+                               const device half4 *alpha [[buffer(0)]],
+                               uint3 gid [[thread_position_in_grid]]){
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+    half4 alpha_value = alpha[gid.z];
+    half4 output;
+    output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
+    output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
+    output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
+    output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void prelu_element_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                          texture2d_array<half, access::write> outTexture [[texture(1)]],
-                          const device half4 *alpha [[buffer(0)]],
-                          uint3 gid [[thread_position_in_grid]]){
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
-  
-  int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size();
-  half4 alpha_value = alpha[alpha_to + gid.z];
-  
-  half4 output;
-  output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
-  output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
-  output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
-  output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
-  outTexture.write(output, gid.xy, gid.z);
+                               texture2d_array<half, access::write> outTexture [[texture(1)]],
+                               const device half4 *alpha [[buffer(0)]],
+                               uint3 gid [[thread_position_in_grid]]){
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+    
+    int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size();
+    half4 alpha_value = alpha[alpha_to + gid.z];
+    
+    half4 output;
+    output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
+    output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
+    output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
+    output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void prelu_other_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                        texture2d_array<half, access::write> outTexture [[texture(1)]],
-                        const device half *alpha [[buffer(0)]],
-                        uint3 gid [[thread_position_in_grid]]){
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
-  half alpha_value = alpha[0];
-  half4 output;
-  output.x = input.x > 0 ? input.x : (alpha_value * input.x);
-  output.y = input.y > 0 ? input.y : (alpha_value * input.y);
-  output.z = input.z > 0 ? input.z : (alpha_value * input.z);
-  output.w = input.w > 0 ? input.w : (alpha_value * input.w);
-  outTexture.write(output, gid.xy, gid.z);
+                             texture2d_array<half, access::write> outTexture [[texture(1)]],
+                             const device half *alpha [[buffer(0)]],
+                             uint3 gid [[thread_position_in_grid]]){
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+    half alpha_value = alpha[0];
+    half4 output;
+    output.x = input.x > 0 ? input.x : (alpha_value * input.x);
+    output.y = input.y > 0 ? input.y : (alpha_value * input.y);
+    output.z = input.z > 0 ? input.z : (alpha_value * input.z);
+    output.w = input.w > 0 ? input.w : (alpha_value * input.w);
+    outTexture.write(output, gid.xy, gid.z);
 }
 
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PriorBoxKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PriorBoxKernel.metal
index 7630febf77210bb364f0191e8b10a5a6923d6c95..c7f97043bfe8dc614080f510098d7b3e10f73c9a 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PriorBoxKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PriorBoxKernel.metal
@@ -16,20 +16,20 @@
 using namespace metal;
 
 struct PriorBoxMetalParam {
-  float offset;
-  float stepWidth;
-  float stepHeight;
-  float minSize;
-  float maxSize;
-  float imageWidth;
-  float imageHeight;
-  
-  bool clip;
-  
-  uint numPriors;
-  uint aspecRatiosSize;
-  uint minSizeSize;
-  uint maxSizeSize;
+    float offset;
+    float stepWidth;
+    float stepHeight;
+    float minSize;
+    float maxSize;
+    float imageWidth;
+    float imageHeight;
+    
+    bool clip;
+    
+    uint numPriors;
+    uint aspecRatiosSize;
+    uint minSizeSize;
+    uint maxSizeSize;
 };
 
 kernel void prior_box(texture2d_array<float, access::read> inTexture [[texture(0)]],
@@ -39,329 +39,329 @@ kernel void prior_box(texture2d_array<float, access::read> inTexture [[texture(0
                       constant PriorBoxMetalParam &param [[buffer(1)]],
                       const device float4 *variances [[buffer(2)]],
                       uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outBoxTexture.get_width() ||
-      gid.y >= outBoxTexture.get_height() ||
-      gid.z >= outBoxTexture.get_array_size()) return;
-  
-  float center_x = (gid.x + param.offset) * param.stepWidth;
-  float center_y = (gid.y + param.offset) * param.stepHeight;
-  
-  float box_width, box_height;
-  
-  if (gid.z < param.aspecRatiosSize) {
-    float ar = aspect_ratios[gid.z];
-    box_width = param.minSize * sqrt(ar) / 2;
-    box_height = param.minSize / sqrt(ar) / 2;
-    float4 box;
-    box.x = (center_x - box_width) / param.imageWidth;
-    box.y = (center_y - box_height) / param.imageHeight;
-    box.z = (center_x + box_width) / param.imageWidth;
-    box.w = (center_y + box_height) / param.imageHeight;
+    if (gid.x >= outBoxTexture.get_width() ||
+        gid.y >= outBoxTexture.get_height() ||
+        gid.z >= outBoxTexture.get_array_size()) return;
     
-    float4 res;
-    if (param.clip) {
-      res = fmin(fmax(box, 0.0), 1.0);
-    } else {
-      res = box;
+    float center_x = (gid.x + param.offset) * param.stepWidth;
+    float center_y = (gid.y + param.offset) * param.stepHeight;
+    
+    float box_width, box_height;
+    
+    if (gid.z < param.aspecRatiosSize) {
+        float ar = aspect_ratios[gid.z];
+        box_width = param.minSize * sqrt(ar) / 2;
+        box_height = param.minSize / sqrt(ar) / 2;
+        float4 box;
+        box.x = (center_x - box_width) / param.imageWidth;
+        box.y = (center_y - box_height) / param.imageHeight;
+        box.z = (center_x + box_width) / param.imageWidth;
+        box.w = (center_y + box_height) / param.imageHeight;
+        
+        float4 res;
+        if (param.clip) {
+            res = fmin(fmax(box, 0.0), 1.0);
+        } else {
+            res = box;
+        }
+        
+        outBoxTexture.write(res, gid.xy, gid.z);
+    } else if (gid.z >= param.aspecRatiosSize) {
+        if (param.maxSizeSize > 0) {
+            // Extra square prior: half-extent is sqrt(minSize * maxSize) / 2 on both axes.
+            box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
+            float4 max_box;
+            max_box.x = (center_x - box_width) / param.imageWidth;
+            max_box.y = (center_y - box_height) / param.imageHeight;
+            max_box.z = (center_x + box_width) / param.imageWidth;
+            max_box.w = (center_y + box_height) / param.imageHeight;
+            // Clamp to [0, 1] when param.clip is set, and write that result (res)
+            // rather than the raw max_box so the clip flag actually takes effect.
+            float4 res = max_box;
+            if (param.clip) {
+                res = min(max(max_box, 0.0), 1.0);
+            }
+            outBoxTexture.write(res, gid.xy, gid.z);
+        }
     }
     
-    outBoxTexture.write(res, gid.xy, gid.z);
-  } else if (gid.z >= param.aspecRatiosSize) {
-    if (param.maxSizeSize > 0) {
-      box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
-      float4 max_box;
-      max_box.x = (center_x - box_width) / param.imageWidth;
-      max_box.y = (center_y - box_height) / param.imageHeight;
-      max_box.z = (center_x + box_width) / param.imageWidth;
-      max_box.w = (center_y + box_height) / param.imageHeight;
-
-      float4 res;
-      if (param.clip) {
-        res = min(max(max_box, 0.0), 1.0);
-      } else {
-        res = max_box;
-      }
-      outBoxTexture.write(max_box, gid.xy, gid.z);
+    float4 variance = variances[0];
+    if (gid.z < param.numPriors) {
+        float4 variances_output;
+        variances_output.x = variance.x;
+        variances_output.y = variance.y;
+        variances_output.z = variance.z;
+        variances_output.w = variance.w;
+        varianceTexture.write(variances_output, gid.xy, gid.z);
     }
-  }
-  
-  float4 variance = variances[0];
-  if (gid.z < param.numPriors) {
-    float4 variances_output;
-    variances_output.x = variance.x;
-    variances_output.y = variance.y;
-    variances_output.z = variance.z;
-    variances_output.w = variance.w;
-    varianceTexture.write(variances_output, gid.xy, gid.z);
-  }
 }
 
 
 kernel void prior_box_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
-                      texture2d_array<half, access::write> outBoxTexture [[texture(1)]],
-                      texture2d_array<half, access::write> varianceTexture [[texture(2)]],
-                      const device half *aspect_ratios [[buffer(0)]],
-                      constant PriorBoxMetalParam &param [[buffer(1)]],
-                      const device float4 *variances [[buffer(2)]],
-                      uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outBoxTexture.get_width() ||
-      gid.y >= outBoxTexture.get_height() ||
-      gid.z >= outBoxTexture.get_array_size()) return;
-  
-  float center_x = (gid.x + param.offset) * param.stepWidth;
-  float center_y = (gid.y + param.offset) * param.stepHeight;
-  
-  float box_width, box_height;
-  
-  if (gid.z < param.aspecRatiosSize) {
-    half ar = aspect_ratios[gid.z];
-    box_width = param.minSize * sqrt(ar) / 2;
-    box_height = param.minSize / sqrt(ar) / 2;
-    float4 box;
-    box.x = (center_x - box_width) / param.imageWidth;
-    box.y = (center_y - box_height) / param.imageHeight;
-    box.z = (center_x + box_width) / param.imageWidth;
-    box.w = (center_y + box_height) / param.imageHeight;
+                           texture2d_array<half, access::write> outBoxTexture [[texture(1)]],
+                           texture2d_array<half, access::write> varianceTexture [[texture(2)]],
+                           const device half *aspect_ratios [[buffer(0)]],
+                           constant PriorBoxMetalParam &param [[buffer(1)]],
+                           const device float4 *variances [[buffer(2)]],
+                           uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outBoxTexture.get_width() ||
+        gid.y >= outBoxTexture.get_height() ||
+        gid.z >= outBoxTexture.get_array_size()) return;
     
-    float4 res;
-    if (param.clip) {
-      res = fmin(fmax(box, 0.0), 1.0);
-    } else {
-      res = box;
+    float center_x = (gid.x + param.offset) * param.stepWidth;
+    float center_y = (gid.y + param.offset) * param.stepHeight;
+    
+    float box_width, box_height;
+    
+    if (gid.z < param.aspecRatiosSize) {
+        half ar = aspect_ratios[gid.z];
+        box_width = param.minSize * sqrt(ar) / 2;
+        box_height = param.minSize / sqrt(ar) / 2;
+        float4 box;
+        box.x = (center_x - box_width) / param.imageWidth;
+        box.y = (center_y - box_height) / param.imageHeight;
+        box.z = (center_x + box_width) / param.imageWidth;
+        box.w = (center_y + box_height) / param.imageHeight;
+        
+        float4 res;
+        if (param.clip) {
+            res = fmin(fmax(box, 0.0), 1.0);
+        } else {
+            res = box;
+        }
+        
+        outBoxTexture.write(half4(res), gid.xy, gid.z);
+    } else if (gid.z >= param.aspecRatiosSize) {
+        if (param.maxSizeSize > 0) {
+            // Extra square prior: half-extent is sqrt(minSize * maxSize) / 2 on both axes.
+            box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
+            float4 max_box;
+            max_box.x = (center_x - box_width) / param.imageWidth;
+            max_box.y = (center_y - box_height) / param.imageHeight;
+            max_box.z = (center_x + box_width) / param.imageWidth;
+            max_box.w = (center_y + box_height) / param.imageHeight;
+            // Clamp to [0, 1] when param.clip is set, and write that result (res)
+            // rather than the raw max_box so the clip flag actually takes effect.
+            float4 res = max_box;
+            if (param.clip) {
+                res = min(max(max_box, 0.0), 1.0);
+            }
+            outBoxTexture.write(half4(res), gid.xy, gid.z);
+        }
     }
     
-    outBoxTexture.write(half4(res), gid.xy, gid.z);
-  } else if (gid.z >= param.aspecRatiosSize) {
-    if (param.maxSizeSize > 0) {
-      box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
-      float4 max_box;
-      max_box.x = (center_x - box_width) / param.imageWidth;
-      max_box.y = (center_y - box_height) / param.imageHeight;
-      max_box.z = (center_x + box_width) / param.imageWidth;
-      max_box.w = (center_y + box_height) / param.imageHeight;
-      
-      float4 res;
-      if (param.clip) {
-        res = min(max(max_box, 0.0), 1.0);
-      } else {
-        res = max_box;
-      }
-      outBoxTexture.write(half4(max_box), gid.xy, gid.z);
+    float4 variance = variances[0];
+    if (gid.z < param.numPriors) {
+        float4 variances_output;
+        variances_output.x = variance.x;
+        variances_output.y = variance.y;
+        variances_output.z = variance.z;
+        variances_output.w = variance.w;
+        varianceTexture.write(half4(variances_output), gid.xy, gid.z);
     }
-  }
-  
-  float4 variance = variances[0];
-  if (gid.z < param.numPriors) {
-    float4 variances_output;
-    variances_output.x = variance.x;
-    variances_output.y = variance.y;
-    variances_output.z = variance.z;
-    variances_output.w = variance.w;
-    varianceTexture.write(half4(variances_output), gid.xy, gid.z);
-  }
 }
 
 
 
 kernel void prior_box_MinMaxAspectRatiosOrder(texture2d_array<float, access::read> inTexture [[texture(0)]],
-                      texture2d_array<float, access::write> outBoxTexture [[texture(1)]],
-                      texture2d_array<float, access::write> varianceTexture [[texture(2)]],
-                      const device float *aspect_ratios [[buffer(0)]],
-                      constant PriorBoxMetalParam &param [[buffer(1)]],
-                      const device float4 *variances [[buffer(2)]],
-                      uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outBoxTexture.get_width() ||
-      gid.y >= outBoxTexture.get_height() ||
-      gid.z >= outBoxTexture.get_array_size()) return;
-  
-  float center_x = (gid.x + param.offset) * param.stepWidth;
-  float center_y = (gid.y + param.offset) * param.stepHeight;
-  
-  float box_width, box_height;
-  
-  
-  
-  if (gid.z == 0) {
-    box_width = box_height = param.minSize / 2;
+                                              texture2d_array<float, access::write> outBoxTexture [[texture(1)]],
+                                              texture2d_array<float, access::write> varianceTexture [[texture(2)]],
+                                              const device float *aspect_ratios [[buffer(0)]],
+                                              constant PriorBoxMetalParam &param [[buffer(1)]],
+                                              const device float4 *variances [[buffer(2)]],
+                                              uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outBoxTexture.get_width() ||
+        gid.y >= outBoxTexture.get_height() ||
+        gid.z >= outBoxTexture.get_array_size()) return;
     
-    float4 box;
-    box.x = (center_x - box_width) / param.imageWidth;
-    box.y = (center_y - box_height) / param.imageHeight;
-    box.z = (center_x + box_width) / param.imageWidth;
-    box.w = (center_y + box_height) / param.imageHeight;
+    float center_x = (gid.x + param.offset) * param.stepWidth;
+    float center_y = (gid.y + param.offset) * param.stepHeight;
     
-    float4 res;
-    if (param.clip) {
-      res = fmin(fmax(box, 0.0), 1.0);
-    } else {
-      res = box;
+    float box_width, box_height;
+    
+    
+    
+    if (gid.z == 0) {
+        box_width = box_height = param.minSize / 2;
+        
+        float4 box;
+        box.x = (center_x - box_width) / param.imageWidth;
+        box.y = (center_y - box_height) / param.imageHeight;
+        box.z = (center_x + box_width) / param.imageWidth;
+        box.w = (center_y + box_height) / param.imageHeight;
+        
+        float4 res;
+        if (param.clip) {
+            res = fmin(fmax(box, 0.0), 1.0);
+        } else {
+            res = box;
+        }
+        
+        outBoxTexture.write(res, gid.xy, gid.z);
     }
-
-    outBoxTexture.write(res, gid.xy, gid.z);
-  }
-  
-  if (gid.z == 1 && param.maxSizeSize > 0) {
     
-    box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
-    float4 max_box;
-    max_box.x = (center_x - box_width) / param.imageWidth;
-    max_box.y = (center_y - box_height) / param.imageHeight;
-    max_box.z = (center_x + box_width) / param.imageWidth;
-    max_box.w = (center_y + box_height) / param.imageHeight;
+    if (gid.z == 1 && param.maxSizeSize > 0) {
+        
+        box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
+        float4 max_box;
+        max_box.x = (center_x - box_width) / param.imageWidth;
+        max_box.y = (center_y - box_height) / param.imageHeight;
+        max_box.z = (center_x + box_width) / param.imageWidth;
+        max_box.w = (center_y + box_height) / param.imageHeight;
+        
+        float4 res;
+        if (param.clip) {
+            res = min(max(max_box, 0.0), 1.0);
+        } else {
+            res = max_box;
+        }
+        outBoxTexture.write(res, gid.xy, gid.z);
+    }
     
-    float4 res;
-    if (param.clip) {
-      res = min(max(max_box, 0.0), 1.0);
+    int aspect_to = 0;
+    if (param.maxSizeSize > 0) {
+        aspect_to = gid.z - 2;
     } else {
-      res = max_box;
+        aspect_to = gid.z - 1;
     }
-    outBoxTexture.write(res, gid.xy, gid.z);
-  }
-  
-  int aspect_to = 0;
-  if (param.maxSizeSize > 0) {
-    aspect_to = gid.z - 2;
-  } else {
-    aspect_to = gid.z - 1;
-  }
-  
-
-  
-  
-  if (aspect_to >= 0 && aspect_to < int(param.aspecRatiosSize)) {
     
-    int skip = 0;
-    for (int i = 0; i < aspect_to + 1; ++i) {
-      if (fabs(aspect_ratios[i] - 1.) < 1e-6) {
-        skip += 1;
-      }
-    }
-    aspect_to += skip;
     
-    float ar = aspect_ratios[aspect_to];
     
-    box_width = param.minSize * sqrt(ar) / 2;
-    box_height = param.minSize / sqrt(ar) / 2;
-    float4 box;
-    box.x = (center_x - box_width) / param.imageWidth;
-    box.y = (center_y - box_height) / param.imageHeight;
-    box.z = (center_x + box_width) / param.imageWidth;
-    box.w = (center_y + box_height) / param.imageHeight;
     
-    float4 res;
-    if (param.clip) {
-      res = fmin(fmax(box, 0.0), 1.0);
-    } else {
-      res = box;
+    if (aspect_to >= 0 && aspect_to < int(param.aspecRatiosSize)) {
+        // Count the entries of aspect_ratios[0 ... aspect_to] that are within 1e-6 of 1.
+        int skip = 0;
+        for (int i = 0; i < aspect_to + 1; ++i) {
+            if (fabs(aspect_ratios[i] - 1.) < 1e-6) {
+                skip += 1;
+            }
+        }
+        aspect_to += skip;
+        // The index has been advanced by that count before the ratio is read.
+        float ar = aspect_ratios[aspect_to];
+        // Half-extents: minSize scaled by sqrt(ar) in width and by 1 / sqrt(ar) in height.
+        box_width = param.minSize * sqrt(ar) / 2;
+        box_height = param.minSize / sqrt(ar) / 2;
+        float4 box;
+        box.x = (center_x - box_width) / param.imageWidth;
+        box.y = (center_y - box_height) / param.imageHeight;
+        box.z = (center_x + box_width) / param.imageWidth;
+        box.w = (center_y + box_height) / param.imageHeight;
+        // Corners are divided by the image size; clamped to [0, 1] only when param.clip is set.
+        float4 res;
+        if (param.clip) {
+            res = fmin(fmax(box, 0.0), 1.0);
+        } else {
+            res = box;
+        }
+        // Written at this thread's position: texel gid.xy of array slice gid.z.
+        outBoxTexture.write(res, gid.xy, gid.z);
     }
     
-    outBoxTexture.write(res, gid.xy, gid.z);
-  }
-  
-  float4 variance = variances[0];
-  if (gid.z < param.numPriors) {
-    float4 variances_output;
-    variances_output.x = variance.x;
-    variances_output.y = variance.y;
-    variances_output.z = variance.z;
-    variances_output.w = variance.w;
-    varianceTexture.write(variances_output, gid.xy, gid.z);
-  }
+    float4 variance = variances[0];
+    if (gid.z < param.numPriors) {
+        float4 variances_output;
+        variances_output.x = variance.x;
+        variances_output.y = variance.y;
+        variances_output.z = variance.z;
+        variances_output.w = variance.w;
+        varianceTexture.write(variances_output, gid.xy, gid.z);
+    }
 }
 
 
 kernel void prior_box_MinMaxAspectRatiosOrder_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
-                           texture2d_array<half, access::write> outBoxTexture [[texture(1)]],
-                           texture2d_array<half, access::write> varianceTexture [[texture(2)]],
-                           const device half *aspect_ratios [[buffer(0)]],
-                           constant PriorBoxMetalParam &param [[buffer(1)]],
-                           const device float4 *variances [[buffer(2)]],
-                           uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outBoxTexture.get_width() ||
-      gid.y >= outBoxTexture.get_height() ||
-      gid.z >= outBoxTexture.get_array_size()) return;
-  
-  float center_x = (gid.x + param.offset) * param.stepWidth;
-  float center_y = (gid.y + param.offset) * param.stepHeight;
-  
-  float box_width, box_height;
-  
-  
-  
-  if (gid.z == 0) {
-    box_width = box_height = param.minSize / 2;
+                                                   texture2d_array<half, access::write> outBoxTexture [[texture(1)]],
+                                                   texture2d_array<half, access::write> varianceTexture [[texture(2)]],
+                                                   const device half *aspect_ratios [[buffer(0)]],
+                                                   constant PriorBoxMetalParam &param [[buffer(1)]],
+                                                   const device float4 *variances [[buffer(2)]],
+                                                   uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outBoxTexture.get_width() ||
+        gid.y >= outBoxTexture.get_height() ||
+        gid.z >= outBoxTexture.get_array_size()) return;
     
-    float4 box;
-    box.x = (center_x - box_width) / param.imageWidth;
-    box.y = (center_y - box_height) / param.imageHeight;
-    box.z = (center_x + box_width) / param.imageWidth;
-    box.w = (center_y + box_height) / param.imageHeight;
+    float center_x = (gid.x + param.offset) * param.stepWidth;
+    float center_y = (gid.y + param.offset) * param.stepHeight;
     
-    float4 res;
-    if (param.clip) {
-      res = fmin(fmax(box, 0.0), 1.0);
-    } else {
-      res = box;
-    }
+    float box_width, box_height;
     
-    outBoxTexture.write(half4(res), gid.xy, gid.z);
-  }
-  
-  if (gid.z == 1 && param.maxSizeSize > 0) {
     
-    box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
-    float4 max_box;
-    max_box.x = (center_x - box_width) / param.imageWidth;
-    max_box.y = (center_y - box_height) / param.imageHeight;
-    max_box.z = (center_x + box_width) / param.imageWidth;
-    max_box.w = (center_y + box_height) / param.imageHeight;
     
-    float4 res;
-    if (param.clip) {
-      res = min(max(max_box, 0.0), 1.0);
-    } else {
-      res = max_box;
+    if (gid.z == 0) {
+        box_width = box_height = param.minSize / 2;
+        
+        float4 box;
+        box.x = (center_x - box_width) / param.imageWidth;
+        box.y = (center_y - box_height) / param.imageHeight;
+        box.z = (center_x + box_width) / param.imageWidth;
+        box.w = (center_y + box_height) / param.imageHeight;
+        
+        float4 res;
+        if (param.clip) {
+            res = fmin(fmax(box, 0.0), 1.0);
+        } else {
+            res = box;
+        }
+        
+        outBoxTexture.write(half4(res), gid.xy, gid.z);
     }
-    outBoxTexture.write(half4(res), gid.xy, gid.z);
-  }
-  
-  int aspect_to = 0;
-  if (param.maxSizeSize > 0) {
-    aspect_to = gid.z - 2;
-  } else {
-    aspect_to = gid.z - 1;
-  }
-  
-  if (aspect_to > 0 && aspect_to < int(param.aspecRatiosSize) && fabs(aspect_ratios[aspect_to] - 1.) > 1e-6) {
-    float ar = aspect_ratios[aspect_to];
     
-    box_width = param.minSize * sqrt(ar) / 2;
-    box_height = param.minSize / sqrt(ar) / 2;
-    float4 box;
-    box.x = (center_x - box_width) / param.imageWidth;
-    box.y = (center_y - box_height) / param.imageHeight;
-    box.z = (center_x + box_width) / param.imageWidth;
-    box.w = (center_y + box_height) / param.imageHeight;
+    if (gid.z == 1 && param.maxSizeSize > 0) {
+        
+        box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
+        float4 max_box;
+        max_box.x = (center_x - box_width) / param.imageWidth;
+        max_box.y = (center_y - box_height) / param.imageHeight;
+        max_box.z = (center_x + box_width) / param.imageWidth;
+        max_box.w = (center_y + box_height) / param.imageHeight;
+        
+        float4 res;
+        if (param.clip) {
+            res = min(max(max_box, 0.0), 1.0);
+        } else {
+            res = max_box;
+        }
+        outBoxTexture.write(half4(res), gid.xy, gid.z);
+    }
     
-    float4 res;
-    if (param.clip) {
-      res = fmin(fmax(box, 0.0), 1.0);
+    int aspect_to = 0;
+    if (param.maxSizeSize > 0) {
+        aspect_to = gid.z - 2;
     } else {
-      res = box;
+        aspect_to = gid.z - 1;
     }
     
-    outBoxTexture.write(half4(res), gid.xy, gid.z);
-  }
-  
-  float4 variance = variances[0];
-  if (gid.z < param.numPriors) {
-    float4 variances_output;
-    variances_output.x = variance.x;
-    variances_output.y = variance.y;
-    variances_output.z = variance.z;
-    variances_output.w = variance.w;
-    varianceTexture.write(half4(variances_output), gid.xy, gid.z);
-  }
+    if (aspect_to > 0 && aspect_to < int(param.aspecRatiosSize) && fabs(aspect_ratios[aspect_to] - 1.) > 1e-6) {
+        float ar = aspect_ratios[aspect_to];
+        // NOTE(review): prior_box_MinMaxAspectRatiosOrder accepts aspect_to >= 0 and shifts the index past ratio==1 entries; this kernel uses > 0 and no shift — confirm intended.
+        box_width = param.minSize * sqrt(ar) / 2;
+        box_height = param.minSize / sqrt(ar) / 2;
+        float4 box;
+        box.x = (center_x - box_width) / param.imageWidth;
+        box.y = (center_y - box_height) / param.imageHeight;
+        box.z = (center_x + box_width) / param.imageWidth;
+        box.w = (center_y + box_height) / param.imageHeight;
+        // Corners are divided by the image size; clamped to [0, 1] only when param.clip is set.
+        float4 res;
+        if (param.clip) {
+            res = fmin(fmax(box, 0.0), 1.0);
+        } else {
+            res = box;
+        }
+        // Converted to half4 and written at texel gid.xy of array slice gid.z.
+        outBoxTexture.write(half4(res), gid.xy, gid.z);
+    }
+    
+    float4 variance = variances[0];
+    if (gid.z < param.numPriors) {
+        float4 variances_output;
+        variances_output.x = variance.x;
+        variances_output.y = variance.y;
+        variances_output.z = variance.z;
+        variances_output.w = variance.w;
+        varianceTexture.write(half4(variances_output), gid.xy, gid.z);
+    }
 }
 
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReluKernel.metal
index e725440bbe997d571f1860bce323516144a94da8..725222d75e1b0c40ecfd2e4f95f35c13e7851e21 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReluKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReluKernel.metal
@@ -17,25 +17,25 @@ using namespace metal;
 
 
 kernel void relu_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                 texture2d_array<half, access::write> outTexture [[texture(1)]],
-                 uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const half4 input = inTexture.read(gid.xy, gid.z);
-  const float4 relu = fmax((float4)input, 0.0);
-  outTexture.write(half4(relu), gid.xy, gid.z);
+                      texture2d_array<half, access::write> outTexture [[texture(1)]],
+                      uint3 gid [[thread_position_in_grid]]) {
+    // Element-wise ReLU; threads outside the output texture's extent do nothing.
+    if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    // read() takes integer texel coordinates, so no sampler object is declared.
+    const half4 input = inTexture.read(gid.xy, gid.z);
+    const float4 relu = fmax((float4)input, 0.0);  // max(x, 0), evaluated in float
+    outTexture.write(half4(relu), gid.xy, gid.z);
 }
 
 kernel void relu(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                      texture2d_array<float, access::write> outTexture [[texture(1)]],
-                      uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero);
-  const float4 input = inTexture.read(gid.xy, gid.z);
-  const float4 relu = fmax((float4)input, 0.0);
-  outTexture.write(float4(relu), gid.xy, gid.z);
+                 texture2d_array<float, access::write> outTexture [[texture(1)]],
+                 uint3 gid [[thread_position_in_grid]]) {
+    // Element-wise ReLU; threads outside the output texture's extent do nothing.
+    if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    // read() takes integer texel coordinates, so no sampler object is declared.
+    const float4 input = inTexture.read(gid.xy, gid.z);
+    const float4 relu = fmax(input, 0.0);  // max(x, 0) per component
+    outTexture.write(relu, gid.xy, gid.z);
 }
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.inc.metal
index 7583537c2b404b7a95eeedfb4c69793a608f18ac..3037e404a354ae6471db9056d84661af1434e2f7 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.inc.metal
@@ -24,43 +24,43 @@
 #define FUNC_R(f, r) CONCAT2_(f, r)
 
 kernel void FUNC(reshape, RIN, ROUT, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
-                    texture2d_array<P, access::write> outTexture [[texture(1)]],
-                    constant ReshapeParam &rp [[buffer(0)]],
-                    uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-
-  int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, oabcd[4], ixyzn[4], iabcd[4];
-  ReshapeParam lrp = rp;
-  int oC = lrp.odim[lrp.otrans[3]];
-  int iC = lrp.idim[lrp.itrans[3]];
-  int count = lrp.odim[0] * lrp.odim[1] * lrp.odim[2] * lrp.odim[3];
-  VECTOR(P, 4) r;
-  for (int n = 0; n < 4; n++) {
-    oxyzn[3] = n;
+                                        texture2d_array<P, access::write> outTexture [[texture(1)]],
+                                        constant ReshapeParam &rp [[buffer(0)]],
+                                        uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    
+    int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, oabcd[4], ixyzn[4], iabcd[4];
+    ReshapeParam lrp = rp;
+    int oC = lrp.odim[lrp.otrans[3]];
+    int iC = lrp.idim[lrp.itrans[3]];
+    int count = lrp.odim[0] * lrp.odim[1] * lrp.odim[2] * lrp.odim[3];
+    VECTOR(P, 4) r;
+    for (int n = 0; n < 4; n++) {
+        oxyzn[3] = n;
 #if ROUT == 4
-    xyzn2abcd_4(oC, oxyzn, oabcd);
+        xyzn2abcd_4(oC, oxyzn, oabcd);
 #else
-    FUNC_R(xyzn2abcd, ROUT)(oxyzn, oabcd);
+        FUNC_R(xyzn2abcd, ROUT)(oxyzn, oabcd);
 #endif
-    int tabcd[4];
-    invtrans(lrp.otrans, oabcd, tabcd);
-    int index = abcd2index(lrp.odim, tabcd);
-    if (index < count) {
-      index2abcd(lrp.idim, index, tabcd);
-      trans(lrp.itrans, tabcd, iabcd);
+        int tabcd[4];
+        invtrans(lrp.otrans, oabcd, tabcd);
+        int index = abcd2index(lrp.odim, tabcd);
+        if (index < count) {
+            index2abcd(lrp.idim, index, tabcd);
+            trans(lrp.itrans, tabcd, iabcd);
 #if RIN == 4
-      abcd2xyzn_4(iC, iabcd, ixyzn);
+            abcd2xyzn_4(iC, iabcd, ixyzn);
 #else
-      FUNC_R(abcd2xyzn, RIN)(iabcd, ixyzn);
+            FUNC_R(abcd2xyzn, RIN)(iabcd, ixyzn);
 #endif
-      r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]];
-    } else {
-      r[n] = 0;
+            r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]];
+        } else {
+            r[n] = 0;
+        }
     }
-  }
-  outTexture.write(r, gid.xy, gid.z);
+    outTexture.write(r, gid.xy, gid.z);
 }
 
 #endif
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.metal
index d2f5815d422ec8c4f3e1e3c1992855547e002264..bb155a87a3fe5f7acfb633eded934b64ea4df178 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.metal
@@ -18,10 +18,10 @@
 using namespace metal;
 
 struct ReshapeParam {
-  int32_t idim[4];
-  int32_t itrans[4];
-  int32_t odim[4];
-  int32_t otrans[4];
+    int32_t idim[4];
+    int32_t itrans[4];
+    int32_t odim[4];
+    int32_t otrans[4];
 };
 
 #define P float
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ResizeBilinear.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ResizeBilinear.metal
index fbb4e12cb82c12f8dc5b94c397e43b8c8c5ae518..3cca15d5518b37743ec8fb6283a4d8583e0520b6 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ResizeBilinear.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ResizeBilinear.metal
@@ -16,60 +16,60 @@
 using namespace metal;
 
 struct resize_bilinear_param {
-//  int32_t out_h;
-//  int32_t out_w;
-  float ratio_h;
-  float ratio_w;
+    // Scale factors the resize_bilinear kernels use to map an output texel
+    // back to a fractional input coordinate (out_h / out_w are not passed).
+    float ratio_h;  // multiplied by gid.y to obtain the source row
+    float ratio_w;  // multiplied by gid.x to obtain the source column
 };
 
 kernel void resize_bilinear(texture2d_array<float, access::read> input [[texture(0)]],
-                     texture2d_array<float, access::write> output [[texture(2)]],
-                     constant resize_bilinear_param & pm [[buffer(0)]],
-                     uint3 gid [[thread_position_in_grid]]) {
-  float4 r;
-  if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
-    r = input.read(gid.xy, gid.z);
-  } else {
-    float w = gid.x * pm.ratio_w;
-    float h = gid.y * pm.ratio_h;
-    uint w0 = w, h0 = h;
-    uint w1 = w0 + 1, h1 = h0 + 1;
-    float w1lambda = w - w0, h1lambda = h - h0;
-    float w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
-    if (w1 >= input.get_width()) w1 = w0;
-    if (h1 >= input.get_height()) h1 = h0;
-    float4 r0 = input.read(uint2(w0, h0), gid.z);
-    float4 r1 = input.read(uint2(w1, h0), gid.z);
-    float4 r2 = input.read(uint2(w0, h1), gid.z);
-    float4 r3 = input.read(uint2(w1, h1), gid.z);
-    r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);
-  }
-  output.write(r, gid.xy, gid.z);
+                            texture2d_array<float, access::write> output [[texture(2)]],  // note: bound at texture index 2
+                            constant resize_bilinear_param & pm [[buffer(0)]],
+                            uint3 gid [[thread_position_in_grid]]) {
+    float4 r;  // the texel this thread writes at (gid.xy, slice gid.z)
+    if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
+        r = input.read(gid.xy, gid.z);  // same size: plain copy, no interpolation
+    } else {
+        float w = gid.x * pm.ratio_w;  // fractional source column
+        float h = gid.y * pm.ratio_h;  // fractional source row
+        uint w0 = w, h0 = h;  // truncate to the near (left / top) neighbour
+        uint w1 = w0 + 1, h1 = h0 + 1;  // far (right / bottom) neighbour
+        float w1lambda = w - w0, h1lambda = h - h0;  // weights of the far neighbours
+        float w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;  // weights of the near neighbours
+        if (w1 >= input.get_width()) w1 = w0;  // past the right edge: reuse the near column
+        if (h1 >= input.get_height()) h1 = h0;  // past the bottom edge: reuse the near row
+        float4 r0 = input.read(uint2(w0, h0), gid.z);
+        float4 r1 = input.read(uint2(w1, h0), gid.z);
+        float4 r2 = input.read(uint2(w0, h1), gid.z);
+        float4 r3 = input.read(uint2(w1, h1), gid.z);
+        r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);  // blend the four neighbours
+    }
+    output.write(r, gid.xy, gid.z);
 }
 
 kernel void resize_bilinear_half(texture2d_array<half, access::read> input [[texture(0)]],
-                            texture2d_array<half, access::write> output [[texture(2)]],
-                            constant resize_bilinear_param & pm [[buffer(0)]],
-                            uint3 gid [[thread_position_in_grid]]) {
-  
-  half4 r;
-  if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
-    r = input.read(gid.xy, gid.z);
-  } else {
-    half w = gid.x * pm.ratio_w;
-    half h = gid.y * pm.ratio_h;
-    uint w0 = w, h0 = h;
-    uint w1 = w0 + 1, h1 = h0 + 1;
-    half w1lambda = w - w0, h1lambda = h - h0;
-    half w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
-    if (w1 >= input.get_width()) w1 = w0;
-    if (h1 >= input.get_height()) h1 = h0;
-    half4 r0 = input.read(uint2(w0, h0), gid.z);
-    half4 r1 = input.read(uint2(w1, h0), gid.z);
-    half4 r2 = input.read(uint2(w0, h1), gid.z);
-    half4 r3 = input.read(uint2(w1, h1), gid.z);
-    r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);
-  }
-  output.write(r, gid.xy, gid.z);
-  output.write(r, gid.xy, gid.z);
+                                 texture2d_array<half, access::write> output [[texture(2)]],
+                                 constant resize_bilinear_param & pm [[buffer(0)]],
+                                 uint3 gid [[thread_position_in_grid]]) {
+    // Half-precision variant of resize_bilinear: each thread writes the texel at (gid.xy, slice gid.z).
+    half4 r;
+    if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
+        r = input.read(gid.xy, gid.z);  // same size: plain copy, no interpolation
+    } else {
+        // Keep source coordinates in float: half has too few mantissa bits to hold the fractional part of large coordinates.
+        float w = gid.x * pm.ratio_w;
+        float h = gid.y * pm.ratio_h;
+        uint w0 = w, h0 = h;  // truncate to the near (left / top) neighbour
+        uint w1 = w0 + 1, h1 = h0 + 1;  // far (right / bottom) neighbour
+        half w1lambda = half(w - w0), h1lambda = half(h - h0);  // weights lie in [0, 1), safe to narrow to half
+        half w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
+        if (w1 >= input.get_width()) w1 = w0;  // past the right edge: reuse the near column
+        if (h1 >= input.get_height()) h1 = h0;  // past the bottom edge: reuse the near row
+        half4 r0 = input.read(uint2(w0, h0), gid.z);
+        half4 r1 = input.read(uint2(w1, h0), gid.z);
+        half4 r2 = input.read(uint2(w0, h1), gid.z);
+        half4 r3 = input.read(uint2(w1, h1), gid.z);
+        r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);  // blend the four neighbours
+    }
+    output.write(r, gid.xy, gid.z);  // written once; the former second identical write was redundant
 }
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Scale.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Scale.metal
index ae4ccdef751535765b78b0573b012ace30b16811..62b5fd0c929e5dae1d6dbb1e70c739b59b8b7192 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Scale.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Scale.metal
@@ -10,21 +10,21 @@
 using namespace metal;
 
 kernel void scale(texture2d<float, access::sample> inTexture [[texture(0)]], texture2d<float, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) return;
-  float w_stride = inTexture.get_width() / outTexture.get_width();
-  float h_stride = inTexture.get_height() / outTexture.get_height();
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  float4 input = inTexture.sample(sample, float2(gid.x * w_stride,    gid.y * h_stride), 0);
-  outTexture.write(input, gid);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) return;  // threads outside the output do nothing
+    float w_stride = float(inTexture.get_width()) / float(outTexture.get_width());  // divide as float: the unsigned quotient truncated non-integer ratios
+    float h_stride = float(inTexture.get_height()) / float(outTexture.get_height());  // same for the vertical ratio
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);  // pixel coordinates, nearest texel, zero outside
+    float4 input = inTexture.sample(sample, float2(gid.x * w_stride,    gid.y * h_stride), 0);  // source texel for this output position
+    outTexture.write(input, gid);
 }
 
 kernel void scale_half(texture2d<float, access::sample> inTexture [[texture(0)]], texture2d<half, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) return;
-  float w_stride = inTexture.get_width() / outTexture.get_width();
-  float h_stride = inTexture.get_height() / outTexture.get_height();
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  float4 input = inTexture.sample(sample, float2(gid.x * w_stride,    gid.y * h_stride), 0);
-  outTexture.write(half4(input), gid);
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) return;  // threads outside the output do nothing
+    float w_stride = float(inTexture.get_width()) / float(outTexture.get_width());  // divide as float: the unsigned quotient truncated non-integer ratios
+    float h_stride = float(inTexture.get_height()) / float(outTexture.get_height());  // same for the vertical ratio
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);  // pixel coordinates, nearest texel, zero outside
+    float4 input = inTexture.sample(sample, float2(gid.x * w_stride,    gid.y * h_stride), 0);  // source texel for this output position
+    outTexture.write(half4(input), gid);  // narrow the float sample to the half output format
 }
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.inc.metal
index 455cf1471b5c369fc27040e03b57812e8d6bf0e8..3affcadd799e3a95e2f39822b4089094003b1cff 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.inc.metal
@@ -21,41 +21,41 @@
 #define VECTOR(p, n) CONCAT2(p, n)
 
 kernel void FUNC(softmax, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
-                    texture2d_array<P, access::write> outTexture [[texture(1)]],
-                    constant SoftmaxParam &sp [[buffer(0)]],
-                    uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-//  int zsize = inTexture.get_array_size();
-  P maxv = inTexture.read(uint2(0, gid.y), 0)[0];
-  int group = sp.K / 4;
-  int remain = sp.K % 4;
-  for (int x = 0; x < group; x++) {
-    VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0);
-    maxv = max(maxv, max(r[0], max(r[1], max(r[2], r[3]))));
-  }
-  if (remain > 0) {
-    VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0);
-    for (int i = 0; i < remain; i++) {
-      maxv = max(maxv, r[i]);
+                             texture2d_array<P, access::write> outTexture [[texture(1)]],
+                             constant SoftmaxParam &sp [[buffer(0)]],
+                             uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;  // threads outside the output do nothing
+    // Softmax over the first sp.K scalars of row gid.y in slice 0, stored four scalars per texel along x.
+    P maxv = inTexture.read(uint2(0, gid.y), 0)[0];  // pass 1: running maximum, seeded with the row's first scalar
+    int group = sp.K / 4;  // number of fully used 4-wide texels
+    int remain = sp.K % 4;  // scalars used in the trailing, partially filled texel
+    for (int x = 0; x < group; x++) {
+        VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0);
+        maxv = max(maxv, max(r[0], max(r[1], max(r[2], r[3]))));
     }
-  }
-  VECTOR(P, 4) rsum = {0, 0, 0, 0};
-  for (int x = 0; x < group; x++) {
-    VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0);
-    rsum += exp(r - maxv);
-  }
-  P sum = rsum[0] + rsum[1] + rsum[2] + rsum[3];
-  if (remain > 0) {
-    VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0);
-    for (int i = 0; i < remain; i++) {
-      sum += exp(r[i] - maxv);
+    if (remain > 0) {
+        VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0);  // trailing texel: only its first `remain` lanes count
+        for (int i = 0; i < remain; i++) {
+            maxv = max(maxv, r[i]);
+        }
     }
-  }
-  VECTOR(P, 4) rr = inTexture.read(gid.xy, gid.z);
-  rr = exp(rr - maxv) / sum;
-  outTexture.write(rr, gid.xy, gid.z);
+    VECTOR(P, 4) rsum = {0, 0, 0, 0};  // pass 2: per-lane sums of exp(v - maxv) over the full texels
+    for (int x = 0; x < group; x++) {
+        VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0);
+        rsum += exp(r - maxv);  // subtracting the maximum keeps every exponent <= 0
+    }
+    P sum = rsum[0] + rsum[1] + rsum[2] + rsum[3];  // fold the four lanes into one scalar
+    if (remain > 0) {
+        VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0);
+        for (int i = 0; i < remain; i++) {
+            sum += exp(r[i] - maxv);
+        }
+    }
+    VECTOR(P, 4) rr = inTexture.read(gid.xy, gid.z);  // pass 3: this thread's own texel
+    rr = exp(rr - maxv) / sum;  // all four lanes are normalised, including any lanes past sp.K
+    outTexture.write(rr, gid.xy, gid.z);
 }
 
 #endif
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.metal
index 67c279a4441095e710985c65d85aac589b7d0f54..f4bc8de4bc0f825d7e40d3e9deb0a8579cbae47b 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.metal
@@ -16,8 +16,8 @@
 using namespace metal;
 
 struct SoftmaxParam {
-  int N;
-  int K;
+    int N;  // NOTE(review): not read by the softmax kernel in Softmax.inc.metal; presumably the row count — confirm
+    int K;  // number of scalars per row the softmax kernel reduces over (it reads K / 4 full texels plus K % 4 lanes)
 };
 
 #define P float
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.inc.metal
index 54e3f21e793a9c1474f13fed61857211cb7d117f..1c9bcc7e18874316db68809688622dc7ec12058b 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.inc.metal
@@ -36,41 +36,41 @@
 
 #if V == VY
 kernel void FUNC(split, R, N, VV, P)(texture2d_array<P, access::read> input [[texture(0)]],
-                                 texture2d_array<P, access::write> out1 [[texture(1)]],
-                                 texture2d_array<P, access::write> out2 [[texture(2)]],
+                                     texture2d_array<P, access::write> out1 [[texture(1)]],
+                                     texture2d_array<P, access::write> out2 [[texture(2)]],
 #if N >= 3
-                                 texture2d_array<P, access::write> out3 [[texture(3)]],
+                                     texture2d_array<P, access::write> out3 [[texture(3)]],  // only declared when N >= 3
 #endif // N >= 3
 #if N >= 4
-                                 texture2d_array<P, access::write> out4 [[texture(4)]],
+                                     texture2d_array<P, access::write> out4 [[texture(4)]],  // only declared when N >= 4
 #endif // N >= 4
-                                 constant SplitParam &sp [[buffer(0)]],
-                                 uint3 gid [[thread_position_in_grid]]) {
-
-  VECTOR(P, 4) r = input.read(gid.xy, gid.z);
-  int y = gid.y - sp.offset;
-  if (y < sp.vdim[0]) {
-    out1.write(r, gid.xy, gid.z);
-    return;
-  }
-  y -= sp.vdim[0];
-  if (y < sp.vdim[1]) {
-    out2.write(r, uint2(gid.x, y), gid.z);
-    return;
-  }
+                                     constant SplitParam &sp [[buffer(0)]],
+                                     uint3 gid [[thread_position_in_grid]]) {
+    // Copies the input texel at gid to whichever output owns its row; outputs partition the y axis by sp.vdim.
+    VECTOR(P, 4) r = input.read(gid.xy, gid.z);
+    int y = gid.y - sp.offset;  // row after removing sp.offset (negative rows also fall into out1 below)
+    if (y < sp.vdim[0]) {
+        out1.write(r, gid.xy, gid.z);  // NOTE(review): writes at gid.xy, not (gid.x, y) — the same only when sp.offset == 0; confirm intent
+        return;
+    }
+    y -= sp.vdim[0];  // row relative to the start of out2
+    if (y < sp.vdim[1]) {
+        out2.write(r, uint2(gid.x, y), gid.z);
+        return;
+    }
 #if N >= 3
-  y -= sp.vdim[1];
-  if (y < sp.vdim[2]) {
-    out3.write(r, uint2(gid.x, y), gid.z);
-    return;
-  }
+    y -= sp.vdim[1];  // row relative to the start of out3
+    if (y < sp.vdim[2]) {
+        out3.write(r, uint2(gid.x, y), gid.z);
+        return;
+    }
 #endif // N >= 3
 #if N >= 4
-  y -= sp.vdim[2];
-  if (y < sp.vdim[3]) {
-    out4.write(r, uint2(gid.x, y), gid.z);
-    return;
-  }
+    y -= sp.vdim[2];  // row relative to the start of out4
+    if (y < sp.vdim[3]) {
+        out4.write(r, uint2(gid.x, y), gid.z);
+        return;
+    }
 #endif // N >= 4
 }
 #endif // V == VY
@@ -88,30 +88,30 @@ kernel void FUNC(split, R, N, VV, P)(texture2d_array<P, access::read> input [[te
 #endif // N >= 4
                                      constant SplitParam &sp [[buffer(0)]],
                                      uint3 gid [[thread_position_in_grid]]) {
-  VECTOR(P, 4) r = input.read(gid.xy, gid.z);
-  int x = gid.x;
-  if (x < sp.vdim[0]) {
-    out1.write(r, gid.xy, gid.z);
-    return;
-  }
-  x -= sp.vdim[0];
-  if (x < sp.vdim[1]) {
-    out2.write(r, uint2(x, gid.y), gid.z);
-    return;
-  }
+    VECTOR(P, 4) r = input.read(gid.xy, gid.z);
+    int x = gid.x;
+    if (x < sp.vdim[0]) {
+        out1.write(r, gid.xy, gid.z);
+        return;
+    }
+    x -= sp.vdim[0];
+    if (x < sp.vdim[1]) {
+        out2.write(r, uint2(x, gid.y), gid.z);
+        return;
+    }
 #if N >= 3
-  x -= sp.vdim[1];
-  if (x < sp.vdim[2]) {
-    out3.write(r, uint2(x, gid.y), gid.z);
-    return;
-  }
+    x -= sp.vdim[1];
+    if (x < sp.vdim[2]) {
+        out3.write(r, uint2(x, gid.y), gid.z);
+        return;
+    }
 #endif // N >= 3
 #if N >= 4
-  x -= sp.vdim[2];
-  if (x < sp.vdim[3]) {
-    out4.write(r, uint2(x, gid.y), gid.z);
-    return;
-  }
+    x -= sp.vdim[2];
+    if (x < sp.vdim[3]) {
+        out4.write(r, uint2(x, gid.y), gid.z);
+        return;
+    }
 #endif // N >= 4
 }
 #endif // V == VX
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.metal
index 4c1e818d2bf5c7266169f406fbfaf8e322685dc4..d167608fbb54793f08f2bc620101fa3492293c0f 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.metal
@@ -18,11 +18,11 @@
 using namespace metal;
 
 struct SplitParam {
-  int32_t idim[4];
-  int32_t axis;
-  int32_t offset;
-  int32_t trans[4];
-  int32_t vdim[4];
+    int32_t idim[4];   // NOTE(review): presumably the input tensor's 4-D extents — not read by the split kernels shown; confirm
+    int32_t axis;      // NOTE(review): presumably the axis being split — not read by the split kernels shown; confirm
+    int32_t offset;    // subtracted from gid.y by the y-axis split kernel before choosing an output
+    int32_t trans[4];  // NOTE(review): presumably an axis permutation — not read by the split kernels shown; confirm
+    int32_t vdim[4];   // vdim[i] is the extent of output i along the split direction (compared against the running row/column)
 };
 
 #define VNORMAL 1
@@ -36,29 +36,29 @@ struct SplitParam {
 
 //// ssd-ar: (R=3, N=2, V=y)
 #define V VY
-  #define R 3
-    #define N 2
-      #define P float
-        #include "Split.inc.metal"
-      #undef P
-      #define P half
-        #include "Split.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 3
+#define N 2
+#define P float
+#include "Split.inc.metal"
+#undef P
+#define P half
+#include "Split.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
 
 
 //// ssd-ar: (R=2, N=2, V=y)
 #define V VY
-  #define R 2
-    #define N 2
-      #define P float
-        #include "Split.inc.metal"
-      #undef P
-      #define P half
-        #include "Split.inc.metal"
-      #undef P
-    #undef N
-  #undef R
+#define R 2
+#define N 2
+#define P float
+#include "Split.inc.metal"
+#undef P
+#define P half
+#include "Split.inc.metal"
+#undef P
+#undef N
+#undef R
 #undef V
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.inc.metal
index 534166e45fc3db49cc5de526ec0d5179ca3f9899..d80361da46d53be81314711ad4d3c6e5420fcdc4 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.inc.metal
@@ -22,39 +22,39 @@
 #define VECTOR(p, n) CONCAT2(p, n)
 
 kernel void FUNC(transpose, R, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
-                      texture2d_array<P, access::write> outTexture [[texture(1)]],
-                      constant TransposeParam &pm [[buffer(0)]],
-                      uint3 gid [[thread_position_in_grid]]) {
-  VECTOR(P, 4) r;
-  int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0};
-  int iabcd[4], oabcd[4], ixyzn[4];
-  for (int n = 0; n < 4; n++) {
-    oxyzn[3] = n;
+                                  texture2d_array<P, access::write> outTexture [[texture(1)]],
+                                  constant TransposeParam &pm [[buffer(0)]],
+                                  uint3 gid [[thread_position_in_grid]]) {
+    VECTOR(P, 4) r;  // the output texel, filled one lane per loop iteration
+    int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0};  // output position as (x, y, slice, lane)
+    int iabcd[4], oabcd[4], ixyzn[4];  // input / output logical coordinates and the input texture position
+    for (int n = 0; n < 4; n++) {
+        oxyzn[3] = n;  // select lane n of the output texel
 #if R == 4
-    xyzn2abcd_4(pm.oC, oxyzn, iabcd);
+        xyzn2abcd_4(pm.oC, oxyzn, oabcd);  // fix: fill oabcd like the R == 3 / R == 2 branches (was iabcd, leaving oabcd uninitialised)
 #endif // R == 4
 #if R == 3
-    xyzn2abcd_3(oxyzn, oabcd);
+        xyzn2abcd_3(oxyzn, oabcd);
 #endif // R == 3
 #if R == 2
-    xyzn2abcd_2(oxyzn, oabcd);
+        xyzn2abcd_2(oxyzn, oabcd);
 #endif // R == 2
-    iabcd[pm.axis[0]] = oabcd[0];
-    iabcd[pm.axis[1]] = oabcd[1];
-    iabcd[pm.axis[2]] = oabcd[2];
-    iabcd[pm.axis[3]] = oabcd[3];
+        iabcd[pm.axis[0]] = oabcd[0];  // output axis i comes from input axis pm.axis[i]
+        iabcd[pm.axis[1]] = oabcd[1];
+        iabcd[pm.axis[2]] = oabcd[2];
+        iabcd[pm.axis[3]] = oabcd[3];
 #if R == 4
-    abcd2xyzn_4(pm.iC, iabcd, ixyzn);
+        abcd2xyzn_4(pm.iC, iabcd, ixyzn);
 #endif // R == 4
 #if R == 3
-    abcd2xyzn_3(iabcd, ixyzn);
+        abcd2xyzn_3(iabcd, ixyzn);
 #endif // R == 3
 #if R == 2
-    abcd2xyzn_2(iabcd, ixyzn);
+        abcd2xyzn_2(iabcd, ixyzn);
 #endif // R == 2
-    r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]];
-  }
-  outTexture.write(r, gid.xy, gid.z);
+        r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]];  // fetch the one scalar that lands in lane n
+    }
+    outTexture.write(r, gid.xy, gid.z);
 }
 
 #endif
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.metal
index 321663b9b7f09eba2041cb0932215d291e44aba6..66c22f03883cb0cdcac9eff9866718735908ca0a 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.metal
@@ -17,47 +17,47 @@
 using namespace metal;
 
 struct TransposeParam {
-  int iC;
-  int oC;
-  int axis[4];
+    int iC;       // passed to abcd2xyzn_4 by the rank-4 transpose kernel (NOTE(review): presumably the input channel count — confirm)
+    int oC;       // passed to xyzn2abcd_4 by the rank-4 transpose kernel (NOTE(review): presumably the output channel count — confirm)
+    int axis[4];  // permutation: output axis i is read from input axis axis[i]
 };
 
 kernel void transpose_copy_float(texture2d_array<float, access::read> inTexture [[texture(0)]],
-                           texture2d_array<float, access::write> outTexture [[texture(1)]],
-                           constant TransposeParam &pm [[buffer(0)]],
-                           uint3 gid [[thread_position_in_grid]]) {
-  outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z);
+                                 texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                 constant TransposeParam &pm [[buffer(0)]],  // bound but not read by this kernel
+                                 uint3 gid [[thread_position_in_grid]]) {
+    outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z);  // texel-for-texel copy at the same position
 }
 kernel void transpose_copy_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
-                           texture2d_array<half, access::write> outTexture [[texture(1)]],
-                           constant TransposeParam &pm [[buffer(0)]],
-                           uint3 gid [[thread_position_in_grid]]) {
-  outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z);
+                                texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                constant TransposeParam &pm [[buffer(0)]],  // bound but not read by this kernel
+                                uint3 gid [[thread_position_in_grid]]) {
+    outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z);  // texel-for-texel copy at the same position
 }
 
 #define R 4
-  #define P float
-    #include "TransposeKernel.inc.metal"
-  #undef P
-  #define P half
-    #include "TransposeKernel.inc.metal"
-  #undef P
+#define P float
+#include "TransposeKernel.inc.metal"
+#undef P
+#define P half
+#include "TransposeKernel.inc.metal"
+#undef P
 #undef R
 
 #define R 3
-  #define P float
-    #include "TransposeKernel.inc.metal"
-  #undef P
-  #define P half
-    #include "TransposeKernel.inc.metal"
-  #undef P
+#define P float
+#include "TransposeKernel.inc.metal"
+#undef P
+#define P half
+#include "TransposeKernel.inc.metal"
+#undef P
 #undef R
 
 #define R 2
-  #define P float
-    #include "TransposeKernel.inc.metal"
-  #undef P
-  #define P half
-    #include "TransposeKernel.inc.metal"
-  #undef P
+#define P float
+#include "TransposeKernel.inc.metal"
+#undef P
+#define P half
+#include "TransposeKernel.inc.metal"
+#undef P
 #undef R
diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
index 7817befaedf1aff04b75abd39cc6f7f06bc935d3..29730fd3b6209f27bf489d71fa0ada72c9c7db58 100644
--- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
+++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
@@ -1,11 +1,11 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
+ 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
-
+ 
  http://www.apache.org/licenses/LICENSE-2.0
-
+ 
  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,35 +16,35 @@ import UIKit
 
 @UIApplicationMain
 class AppDelegate: UIResponder, UIApplicationDelegate {
-
+    
     var window: UIWindow?
-
+    
     func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplicationLaunchOptionsKey: Any]?) -> Bool {
         // Override point for customization after application launch.
         return true
     }
-
+    
     func applicationWillResignActive(_ application: UIApplication) {
         // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state.
         // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game.
     }
-
+    
     func applicationDidEnterBackground(_ application: UIApplication) {
         // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later.
         // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits.
     }
-
+    
     func applicationWillEnterForeground(_ application: UIApplication) {
         // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background.
     }
-
+    
     func applicationDidBecomeActive(_ application: UIApplication) {
         // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface.
     }
-
+    
     func applicationWillTerminate(_ application: UIApplication) {
         // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:.
     }
-
-
+    
+    
 }
diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
index 98f03affa2a230b2698edf6bafe5e06def8986b6..4c5886c7c1d8504d418c958de4dfdd4240303529 100644
--- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
+++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
@@ -1,11 +1,11 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
+ 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
-
+ 
  http://www.apache.org/licenses/LICENSE-2.0
-
+ 
  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -27,9 +27,9 @@ class ViewController: UIViewController {
             inQueue: queue
         )
         test.testConcat()
-//        test.testReshape()
-//        test.testTranspose()
+        //        test.testReshape()
+        //        test.testTranspose()
         print(" done ")
     }
-
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
index 3aa4e88541dce7de9808e3ba7da545f4de015481..afa580e3cb0be7e5a3f13bf00f9ae355f8e7d9f2 100644
--- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
@@ -741,7 +741,7 @@
 				CODE_SIGN_IDENTITY = "iPhone Developer";
 				CODE_SIGN_STYLE = Automatic;
 				DEFINES_MODULE = YES;
-				DEVELOPMENT_TEAM = "";
+				DEVELOPMENT_TEAM = A798K58VVL;
 				DYLIB_COMPATIBILITY_VERSION = 1;
 				DYLIB_CURRENT_VERSION = 1;
 				DYLIB_INSTALL_NAME_BASE = "@rpath";
@@ -778,7 +778,7 @@
 				CODE_SIGN_IDENTITY = "iPhone Developer";
 				CODE_SIGN_STYLE = Automatic;
 				DEFINES_MODULE = YES;
-				DEVELOPMENT_TEAM = "";
+				DEVELOPMENT_TEAM = A798K58VVL;
 				DYLIB_COMPATIBILITY_VERSION = 1;
 				DYLIB_CURRENT_VERSION = 1;
 				DYLIB_INSTALL_NAME_BASE = "@rpath";
diff --git a/metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift b/metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift
index da66460d8bc895ce3b31e1ec7866765827515054..ba15d890a4fcdecc011c2d6dfdeddf87dab3a94e 100644
--- a/metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift
+++ b/metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift
@@ -15,24 +15,26 @@
 import Foundation
 
 @objc public enum MetalLoadMode: Int {
-  case
-  LoadMetalInPaddleMobile   = 1,     // 使用 paddle-mobile 中的 metal 代码
-  LoadMetalInDefaultLib     = 2,     // 使用 main bundle 中的 metal 代码
-  LoadMetalInCustomMetalLib = 3      // 使用 metal 库文件
+    case
+    LoadMetalInPaddleMobile   = 1,     // use the Metal code shipped inside paddle-mobile
+    LoadMetalInDefaultLib     = 2,     // use the Metal code in the main bundle
+    LoadMetalInCustomMetalLib = 3      // use a Metal library file
 }
 
 @objc public enum ComputePrecision: Int {
-  case
-  Float32 = 1,
-  Float16 = 2
+    case
+    Float32 = 1,  // NOTE(review): presumably selects the 32-bit float kernels — confirm
+    Float16 = 2   // the default value of GlobalConfig.computePrecision
 }
 
 @objc public class GlobalConfig: NSObject {
-  
-  /// 单例
-  @objc public static let shared: GlobalConfig = GlobalConfig.init()
-  
-  /// 运算精度， runner 生命周期中不可变
-  @objc public var computePrecision: ComputePrecision = .Float16
-
+    
+    /// Shared singleton instance.
+    @objc public static let shared: GlobalConfig = GlobalConfig.init()
+    
+    /// Compute precision; must not be changed during a runner's lifetime.
+    @objc public var computePrecision: ComputePrecision = .Float16
+    
+    /// Whether logging is enabled (on by default).
+    @objc public var debug: Bool = true
 }
diff --git a/metal/paddle-mobile/paddle-mobile/API/Net.swift b/metal/paddle-mobile/paddle-mobile/API/Net.swift
index 33cedb5712da3172f2f1b518d1866ec8ac9a3283..5087ebfd824ef4b8a4e7a137ac186f2a01c9f00f 100644
--- a/metal/paddle-mobile/paddle-mobile/API/Net.swift
+++ b/metal/paddle-mobile/paddle-mobile/API/Net.swift
@@ -17,74 +17,74 @@ import Foundation
 
 /// 网络的基类， 参数已经给了默认值，请在子类实现中修改需要改的参数
 @objc open class Net: NSObject {
-
-  /// 默认为0， 如果指定个数， 后边 except 个op不使用 GPU 运算， 中间结果会通过 fetchResult 传参过来
-  @objc public var except: Int = 0
-      
-  /// 预处理 kernel， 如果输入图像需要预处理， 则指定预处理 kernel
-  @objc public var preprocessKernel: CusomKernel? = nil
-  
-  // 以下四个参数为从内存中读取模型时用到的参数
-  /// 模型在内存中的指针
-  @objc public var modelPointer: UnsafeMutableRawPointer? = nil
-  
-  /// 模型大小 单位： 字节
-  @objc public var modelSize: Int = 0
-  
-  /// 权重参数在内存中的指针
-  @objc public var paramPointer: UnsafeMutableRawPointer? = nil
-  
-  /// 权重大小 单位： 字节
-  @objc public var paramSize: Int = 0
-  
-  // 以下两个为从文件中读取模型时用到的参数
-  /// 模型文件路径
-  @objc public var modelPath: String? = nil
-  
-  /// 权重文件路径
-  @objc public var paramPath: String? = nil
-  
-  /// 代表着 GPU 处理器
-  @objc public let device: MTLDevice
-  
-  /// metal 代码加载方式 注意： 如果静态库只能使用 LoadMetalInDefaultLib LoadMetalInCustomMetalLib 进行 load metal 代码
-  @objc public var metalLoadMode: MetalLoadMode = .LoadMetalInPaddleMobile
-  
-  /// 当 metalLoadMode 为 LoadMetalInCustomMetalLib 时， metal library 路径不能为空
-  @objc public var metalLibPath: String? = nil
-  
-  /// 输入维度，按照 n h w c 方式传入
-  @objc public var inputDim: Dim = Dim.init(inDim: [])
-  
-  
-  @objc public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) {
-    self.paramPointer = inParamPointer
-    self.paramSize = inParamSize
-    self.modelPointer = inModelPointer
-    self.modelSize = inModelSize
-    self.device = device
-    super.init()
-  }
-  
-  @objc public init(device: MTLDevice) {
-    self.device = device
-    super.init()
-  }
-  
-  @objc open func resultStr(res: [ResultHolder]) -> String {
-    fatalError()
-  }
-  
-  @objc open func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] {
-    return paddleMobileRes.map { (gpuRes) -> ResultHolder in
-      guard let inResPointer = gpuRes.resultPointer else {
+    
+    /// Defaults to 0. When set, the last `except` ops are not run on the GPU; the intermediate results are handed to fetchResult.
+    @objc public var except: Int = 0
+    
+    /// Preprocessing kernel; set it when the input image needs preprocessing.
+    @objc public var preprocessKernel: CusomKernel? = nil
+    
+    // The following four properties are used when the model is read from memory.
+    /// Pointer to the model in memory.
+    @objc public var modelPointer: UnsafeMutableRawPointer? = nil
+    
+    /// Model size, in bytes.
+    @objc public var modelSize: Int = 0
+    
+    /// Pointer to the weight parameters in memory.
+    @objc public var paramPointer: UnsafeMutableRawPointer? = nil
+    
+    /// Weight size, in bytes.
+    @objc public var paramSize: Int = 0
+    
+    // The following two properties are used when the model is read from files.
+    /// Path of the model file.
+    @objc public var modelPath: String? = nil
+    
+    /// Path of the weight file.
+    @objc public var paramPath: String? = nil
+    
+    /// Represents the GPU.
+    @objc public let device: MTLDevice
+    
+    /// How the Metal code is loaded. Note: a static-library build can only use LoadMetalInDefaultLib or LoadMetalInCustomMetalLib.
+    @objc public var metalLoadMode: MetalLoadMode = .LoadMetalInPaddleMobile
+    
+    /// Path of the Metal library; must not be nil when metalLoadMode is LoadMetalInCustomMetalLib.
+    @objc public var metalLibPath: String? = nil
+    
+    /// Input dimensions, given in n h w c order.
+    @objc public var inputDim: Dim = Dim.init(inDim: [])
+    
+    /// Creates a net that records in-memory model and weight buffers (pointer plus byte size for each).
+    @objc public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) {
+        self.paramPointer = inParamPointer
+        self.paramSize = inParamSize
+        self.modelPointer = inModelPointer
+        self.modelSize = inModelSize
+        self.device = device
+        super.init()
+    }
+    
+    @objc public init(device: MTLDevice) {  // only stores the device; every other property keeps its default
+        self.device = device
+        super.init()
+    }
+    
+    @objc open func resultStr(res: [ResultHolder]) -> String {  // the base implementation always traps via fatalError()
         fatalError()
-      }
-      return ResultHolder.init(inResult: inResPointer, inCapacity: gpuRes.capacity, inDim: gpuRes.dim)
     }
-  }
-  
-  open func updateProgram(program: Program) {
-  }
-  
+    
+    @objc open func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] {  // wraps each GPU result in a ResultHolder
+        return paddleMobileRes.map { (gpuRes) -> ResultHolder in
+            guard let inResPointer = gpuRes.resultPointer else {
+                fatalError()  // a GPU result without a pointer is treated as unrecoverable
+            }
+            return ResultHolder.init(inResult: inResPointer, inCapacity: gpuRes.capacity, inDim: gpuRes.dim)
+        }
+    }
+    
+    open func updateProgram(program: Program) {  // does nothing in the base class
+    }
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/API/Runner.swift b/metal/paddle-mobile/paddle-mobile/API/Runner.swift
index 2d7bf9d190fcc99dd3932915d6a523098b94b3f7..d6c30805ebd1bf408efcf56ec5e1905b3beb9567 100644
--- a/metal/paddle-mobile/paddle-mobile/API/Runner.swift
+++ b/metal/paddle-mobile/paddle-mobile/API/Runner.swift
@@ -16,187 +16,187 @@ import MetalKit
 import Foundation
 
 @objc public class ResultHolder: NSObject {
-  @objc public let result: UnsafeMutablePointer<Float32>
-  @objc public let capacity: Int
-  @objc public let dim: [Int]
-  
-  init(inResult: UnsafeMutablePointer<Float32>, inCapacity: Int, inDim: [Int]) {
-    result = inResult
-    capacity = inCapacity
-    dim = inDim
-  }
-  
-  @objc public func releasePointer() {
-    result.deinitialize(count: capacity)
-    result.deallocate()
-  }
+    @objc public let result: UnsafeMutablePointer<Float32>
+    @objc public let capacity: Int
+    @objc public let dim: [Int]
+    
+    init(inResult: UnsafeMutablePointer<Float32>, inCapacity: Int, inDim: [Int]) {
+        self.dim = inDim  // dimensions reported for the result
+        self.capacity = inCapacity  // element count later passed to deinitialize(count:) by releasePointer()
+        self.result = inResult  // pointer to the Float32 values
+    }
+    
+    @objc public func releasePointer() {  // frees the memory behind `result`; nothing here guards against a second call
+        result.deinitialize(count: capacity)
+        result.deallocate()
+    }
 }
 
 @objc public class Runner: NSObject {
-  var program: Program?
-  var executor: Executor<Float32>?
-  var queue: MTLCommandQueue?
-  var textureLoader: MTKTextureLoader?
-  public let net: Net
-  let device: MTLDevice?
-  let numel: Int
-  
-  /// 初始化函数
-  ///
-  /// - Parameters:
-  ///   - inNet: 传入自定义的网络
-  ///   - commandQueue: commandQueue
-  @objc public init(inNet: Net, commandQueue: MTLCommandQueue?) {
-    guard inNet.inputDim.cout() == 4 else {
-      fatalError(" input dim count must 4 ")
-    }
+    var program: Program?
+    var executor: Executor<Float32>?
+    var queue: MTLCommandQueue?
+    var textureLoader: MTKTextureLoader?
+    public let net: Net
+    let device: MTLDevice?
+    let numel: Int
     
-    net = inNet
-    queue = commandQueue
-    device = queue?.device
-    if let inDevice = device {
-      textureLoader = MTKTextureLoader.init(device: inDevice)
-    }
-    numel = net.inputDim.numel()
-  }
-  
-  /// load 模型, 返回 true 可进行预测
-  ///
-  /// - Returns: load 成功或失败
-  @objc public func load() -> Bool {
-      guard let inDevice = device, let inQueue = queue else {
-        print(" paddle mobile gpu load error, need MTLCommandQueue")
-        return false
-      }
-      let loader = Loader<Float32>.init()
-      do {
-        
-        if let inParamPointer = net.paramPointer, let inModelPointer = net.modelPointer {
-          guard net.paramSize > 0 && net.modelSize > 0 else {
-            print(" load from memory param size or model size can't 0 ")
-            return false
-          }
-          program = try loader.load(device: inDevice, paramPointer: inParamPointer, paramSize: net.paramSize,modePointer:inModelPointer,modelSize:net.modelSize)
-        } else if let inModelPath = net.modelPath, let inParamPath = net.paramPath {
-          program = try loader.load(device: inDevice, modelPath: inModelPath, paraPath: inParamPath)
-        } else {
-          print(" model pointer or model file path need be specified")
-          return false
+    /// 初始化函数
+    ///
+    /// - Parameters:
+    ///   - inNet: 传入自定义的网络
+    ///   - commandQueue: commandQueue
+    @objc public init(inNet: Net, commandQueue: MTLCommandQueue?) {
+        guard inNet.inputDim.cout() == 4 else {
+            fatalError(" input dim count must 4 ")
         }
         
-        let initContext: InitContext = InitContext.init()
-        initContext.metalLoadMode = net.metalLoadMode
-        initContext.metalLibPath = net.metalLibPath
-        executor = try Executor<Float32>.init(inDevice: inDevice, inQueue: inQueue, inProgram: program!, initContext: initContext)
-        net.updateProgram(program: program!)
-      } catch let error {
-        print(error)
-        return false
-      }
-    return true
-  }
-  
-  /// 预测
-  ///
-  /// - Parameters:
-  ///   - texture: 输入 texture 需要使用 getTexture 获得
-  ///   - completion: 结果回调， 当 success 为 true 时 result 不为 nil
-  @objc public func predict(texture: MTLTexture, completion: @escaping ( _ success: Bool, _ result: [ResultHolder]?) -> Void) {
-    do {
-      try self.executor?.predict(input: texture, dim: self.net.inputDim, completionHandle: { [weak self] (res) in
-        guard let SSelf = self else {
-          fatalError( " self nil " )
+        net = inNet
+        queue = commandQueue
+        device = queue?.device
+        if let inDevice = device {
+            textureLoader = MTKTextureLoader.init(device: inDevice)
         }
-        let result = SSelf.net.fetchResult(paddleMobileRes: res)
-        completion(true, result)
-        }, preProcessKernle: self.net.preprocessKernel, except: self.net.except)
-    } catch let error {
-      print(error)
-      completion(false, nil)
-      return
-    }
-  }
-  
-  /// 清理内存, 调用此函数后, 不能再使用, 需重新 load
-  @objc public func clear() {
-    executor?.clear()
-    executor = nil
-    program = nil
-  }
-  
-  /// 获取 texture, 对 texture 进行预处理, 预测时使用
-  ///
-  /// - Parameters:
-  ///   - image: 输入图像
-  ///   - getTexture: 获取 texture 回调
-  @objc public func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) {
-    let texture = try? textureLoader?.newTexture(cgImage: image, options: [:]) ?! " texture loader error"
-    scaleTexture(input: texture!, complete: getTexture)
-  }
-  
-  /// 通过 buffer 获取 texture， 内部会使用GPU进行转换操作
-  ///
-  /// - Parameters:
-  ///   - inBuffer: 输入buffer
-  ///   - getTexture: 结果回调
-  @objc public func getTexture(inBuffer: MTLBuffer, getTexture: @escaping (MTLTexture) -> Void) {
-    guard let inQueue = queue, let inDevice = device else {
-      fatalError( " queue or devcie nil " )
+        numel = net.inputDim.numel()
     }
     
-    guard let buffer = inQueue.makeCommandBuffer() else {
-      fatalError( " make buffer error" )
-    }
-    
-    let bufferToTextureKernel = BufferToTextureKernel.init(device: inDevice, outputDim: Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: net.inputDim[3]), metalLoadMode: net.metalLoadMode, metalLibPath: net.metalLibPath)
-    do {
-      try bufferToTextureKernel.compute(inputBuffer: inBuffer, commandBuffer: buffer)
-    } catch {
-      fatalError(" bufferToTextureKernel error ")
+    /// Loads the model and creates the executor; prediction is possible once this returns true.
+    ///
+    /// - Returns: true when loading succeeded, false otherwise (the reason is printed)
+    @objc public func load() -> Bool {
+        guard let metalDevice = device, let commandQueue = queue else {
+            print(" paddle mobile gpu load error, need MTLCommandQueue")
+            return false
+        }
+        let modelLoader = Loader<Float32>.init()
+        do {
+            // The in-memory model wins when both pointers are set; otherwise fall back to the file paths.
+            if let paramData = net.paramPointer, let modelData = net.modelPointer {
+                if net.paramSize <= 0 || net.modelSize <= 0 {
+                    print(" load from memory param size or model size can't 0 ")
+                    return false
+                }
+                program = try modelLoader.load(device: metalDevice, paramPointer: paramData, paramSize: net.paramSize,modePointer:modelData,modelSize:net.modelSize)
+            } else if let modelFile = net.modelPath, let paramFile = net.paramPath {
+                program = try modelLoader.load(device: metalDevice, modelPath: modelFile, paraPath: paramFile)
+            } else {
+                print(" model pointer or model file path need be specified")
+                return false
+            }
+            // Pass the net's Metal library settings on to the executor.
+            let context: InitContext = InitContext.init()
+            context.metalLoadMode = net.metalLoadMode
+            context.metalLibPath = net.metalLibPath
+            executor = try Executor<Float32>.init(inDevice: metalDevice, inQueue: commandQueue, inProgram: program!, initContext: context)
+            net.updateProgram(program: program!)
+        } catch let error {
+            print(error)
+            return false
+        }
+        return true
     }
     
-    buffer.addCompletedHandler { (buffer) in
-      getTexture(bufferToTextureKernel.outputTexture)
+    /// Runs a prediction on the given texture.
+    ///
+    /// - Parameters:
+    ///   - texture: input texture; obtain it through getTexture
+    ///   - completion: result callback; result is non-nil when success is true. (false, nil) is reported when no model is loaded or prediction throws
+    @objc public func predict(texture: MTLTexture, completion: @escaping ( _ success: Bool, _ result: [ResultHolder]?) -> Void) {
+        guard let inExecutor = executor else {
+            completion(false, nil)  // with no executor, optional chaining would skip the call and the caller would never hear back
+            return
+        }
+        do {
+            try inExecutor.predict(input: texture, dim: net.inputDim, completionHandle: { [weak self] (res) in
+                guard let SSelf = self else { fatalError( " self nil " ) }
+                completion(true, SSelf.net.fetchResult(paddleMobileRes: res))
+            }, preProcessKernle: net.preprocessKernel, except: net.except)
+        } catch let error {
+            print(error)
+            completion(false, nil)
+        }
     }
     
-    buffer.commit()
-  }
-
-  /// 更新输入维度， 针对可变长输入模型
-  ///
-  /// - Parameter inDim: 输入维度
-  @objc public func updateInputDim(inDim: Dim) {
-    if net.inputDim != inDim {
-      guard let inProgram = program else {
-        fatalError(" need load first ")
-      }
-      net.inputDim = inDim
-      net.updateProgram(program: inProgram)
+    /// Releases memory. After calling this the runner cannot be used again until load() is called.
+    @objc public func clear() {
+        executor?.clear()
+        executor = nil
+        program = nil
     }
-  }
-  
-  public func scaleTexture(input: MTLTexture , complete: @escaping (MTLTexture) -> Void) {
     
-    guard let inQueue = queue, let inDevice = device else {
-      fatalError( " queue or devcie nil " )
+    /// Creates a texture from an image and preprocesses it; the result is the texture to use for prediction.
+    ///
+    /// - Parameters:
+    ///   - image: input image
+    ///   - getTexture: callback that receives the texture
+    @objc public func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) {
+        let texture = try? textureLoader?.newTexture(cgImage: image, options: [:]) ?! " texture loader error"  // NOTE(review): try? turns a thrown error into nil — confirm
+        scaleTexture(input: texture!, complete: getTexture)  // the force-unwrap traps if the line above produced nil
     }
     
-    guard let buffer = inQueue.makeCommandBuffer() else {
-      fatalError( " make buffer error" )
+    /// Gets a texture from a buffer; the conversion is performed on the GPU internally.
+    ///
+    /// - Parameters:
+    ///   - inBuffer: input buffer
+    ///   - getTexture: result callback
+    @objc public func getTexture(inBuffer: MTLBuffer, getTexture: @escaping (MTLTexture) -> Void) {
+        guard let commandQueue = queue, let metalDevice = device else {
+            fatalError( " queue or devcie nil " )
+        }
+        guard let commandBuffer = commandQueue.makeCommandBuffer() else {
+            fatalError( " make buffer error" )
+        }
+        
+        // The output shape comes from net.inputDim: [1] is the height, [2] the width, [3] the channel count.
+        let outputShape = Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: net.inputDim[3])
+        let converter = BufferToTextureKernel.init(device: metalDevice, outputDim: outputShape, metalLoadMode: net.metalLoadMode, metalLibPath: net.metalLibPath)
+        do {
+            try converter.compute(inputBuffer: inBuffer, commandBuffer: commandBuffer)
+        } catch {
+            fatalError(" bufferToTextureKernel error ")
+        }
+        // Hand over the converted texture once the command buffer has completed.
+        commandBuffer.addCompletedHandler { _ in
+            getTexture(converter.outputTexture)
+        }
+        commandBuffer.commit()
     }
     
-    let scaleKernel = ScaleKernel.init(device: inDevice, shape: Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: 3), metalLoadMode: net.metalLoadMode, metalLibPath: net.metalLibPath)
-    
-    do {
-      try scaleKernel.compute(inputTexuture: input, commandBuffer: buffer)
-    } catch let error {
-      print(error)
-      fatalError()
+    /// Updates the input dimensions; intended for models with variable-length input.
+    ///
+    /// - Parameter inDim: the new input dimensions
+    @objc public func updateInputDim(inDim: Dim) {
+        guard net.inputDim != inDim else { return }  // unchanged dimensions: nothing to do
+        // Changing the dimensions requires an already loaded program.
+        guard let loadedProgram = program else {
+            fatalError(" need load first ")
+        }
+        net.inputDim = inDim
+        net.updateProgram(program: loadedProgram)
     }
     
-    buffer.addCompletedHandler { (buffer) in
-      complete(scaleKernel.outputTexture)
+    /// Runs ScaleKernel on `input`, using a 3-channel shape taken from the net's input width and height.
+    /// - Parameters:
+    ///   - input: texture to process
+    ///   - complete: called with the kernel's output texture once the command buffer has completed
+    public func scaleTexture(input: MTLTexture , complete: @escaping (MTLTexture) -> Void) {
+        guard let commandQueue = queue, let metalDevice = device else {
+            fatalError( " queue or devcie nil " )
+        }
+        guard let commandBuffer = commandQueue.makeCommandBuffer() else {
+            fatalError( " make buffer error" )
+        }
+        let targetShape = Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: 3)
+        let scaler = ScaleKernel.init(device: metalDevice, shape: targetShape, metalLoadMode: net.metalLoadMode, metalLibPath: net.metalLibPath)
+        do {
+            try scaler.compute(inputTexuture: input, commandBuffer: commandBuffer)
+        } catch let error {
+            print(error)
+            fatalError()
+        }
+        commandBuffer.addCompletedHandler { _ in
+            complete(scaler.outputTexture)
+        }
+        commandBuffer.commit()
     }
-    buffer.commit()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/Extensions.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/Extensions.swift
index 12bc909be97b372ce9f82daf035dced0b969cdc7..64786d0a45fde417021fc468e5526076ca760753 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Common/Extensions.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Common/Extensions.swift
@@ -16,128 +16,128 @@ import Foundation
 
 // 自定义 ?!  如果 ?! 前的返回值为一个可选值, 则进行隐式解包, 如果有值则返回这个值, 如果为nil 则fatalError 传入的信息
 precedencegroup ExecutedOrFatalError{
-  associativity: left
-  higherThan: AssignmentPrecedence
+    associativity: left
+    higherThan: AssignmentPrecedence
 }
 infix operator ?!: ExecutedOrFatalError
 public func ?!<T>(option: T?, excuteOrError: @autoclosure () -> String) -> T{
-  if let inOpt = option {
-    return inOpt
-  }else{
-    print(excuteOrError())
-    fatalError(excuteOrError())
-  }
+    guard let inOpt = option else {
+        let message = excuteOrError()  // evaluate the message autoclosure once, not once per use
+        print(message)
+        fatalError(message)
+    }
+    return inOpt
 }
 
 //Lense
 struct Lense<A, B> {
-  let from: (A) -> B
-  let to: (B, A) -> A
+    let from: (A) -> B  // reads a B out of an A
+    let to: (B, A) -> A  // takes a B and an A and returns an A; presumably the A with the B written back — confirm
 }
 
 precedencegroup CombineLense{
-  associativity: left
-  higherThan: AssignmentPrecedence
+    associativity: left
+    higherThan: AssignmentPrecedence
 }
 
 infix operator >>>: CombineLense
 func >>><A, B, C>(left: Lense<B, C>, right: Lense<A, B>) -> Lense<A, C> {
-  return Lense<A, C>.init(from: { (a) -> C in
-    left.from(right.from(a))
-  }, to: { (c, a) -> A in
-    right.to( left.to(c, right.from(a)),a)
-  })
+    // `from` goes A -> B -> C; `to` applies left.to to the intermediate B, then right.to to put that B back into a.
+    return Lense<A, C>(
+        from: { (a) -> C in left.from(right.from(a)) },
+        to: { (c, a) -> A in right.to(left.to(c, right.from(a)), a) }
+    )
 }
 
 protocol CIntIndex {
-  associatedtype T;
-  subscript(index: CInt) -> T { get set};
+    associatedtype T;  // type produced and accepted by the subscript
+    subscript(index: CInt) -> T { get set};  // read/write access by a C int index
 }
 
 extension Array: CIntIndex{
-  typealias T = Element
-  subscript(index: CInt) -> T {
-    get{
-      guard Int64(Int.max) >= Int64(index) else{
-        fatalError("cint index out of Int range")
-      }
-      return self[Int(index)]
-    }
-    set{
-      guard Int64(Int.max) >= Int64(index) else{
-        fatalError("cint index out of Int range")
-      }
-      self[Int(index)] = newValue
+    typealias T = Element
+    subscript(index: CInt) -> T {  // Array access by C int index; forwards to the Int subscript
+        get{
+            guard Int64(Int.max) >= Int64(index) else{  // NOTE(review): CInt is Int32, so this looks always true — confirm
+                fatalError("cint index out of Int range")
+            }
+            return self[Int(index)]
+        }
+        set{
+            guard Int64(Int.max) >= Int64(index) else{  // same check as in the getter
+                fatalError("cint index out of Int range")
+            }
+            self[Int(index)] = newValue
+        }
+        
     }
-    
-  }
 }
 
 extension Array where Element: AnyObject{
-  mutating func remove(element: Element) {
-    if let index = index(where: { (node) -> Bool in
-      return unsafeBitCast(element, to: Int.self) == unsafeBitCast(node, to: Int.self)
-    }) {
-      remove(at: index)
+    /// Removes the first element that is the same instance as `element` (identity, not equality).
+    mutating func remove(element: Element) {
+        // `===` compares object identity directly, without reinterpreting the references as Int.
+        if let index = index(where: { $0 === element }) {
+            remove(at: index)
+        }
     }
-  }
-  
+    
 }
 
 //MARK: Array extension
 extension Array where Element: Comparable{
-  
-  /// 返回数组前 r 个元素, 并将元素处于原数组的位置作为元组的第一个元素返回
-  ///
-  /// - Parameter r: 前 r 个元素
-  /// - Returns: [(原有位置, 排好位置的元素)]
-  public func top(r: Int) -> [(Int, Element)] {
-    precondition(r <= self.count)
-    return Array<(Int, Element)>(zip(0..<self.count, self).sorted{ $0.1 > $1.1 }.prefix(through: r - 1))
-  }
+    
+    /// Returns the `r` largest elements in descending order, each paired with its index in the original array.
+    ///
+    /// - Parameter r: number of elements to return; must not exceed `count`
+    /// - Returns: [(original index, element)]
+    public func top(r: Int) -> [(Int, Element)] {
+        precondition(r <= self.count)
+        return enumerated().sorted { $0.element > $1.element }.prefix(r).map { ($0.offset, $0.element) }
+    }
 }
 
 extension Array {
-  public func strideArray(inCount: Int = 20) -> [(Int, Element)] {
-    if count < inCount {
-      return (0..<count).map{ ($0, self[$0]) }
-    } else {
-      let stride = count / inCount
-      var newArray: [(Int, Element)] = []
-      for i in 0..<inCount {
-        newArray.append((i * stride, self[i * stride]))
-      }
-      return newArray
+    /// Samples at most `inCount` evenly strided elements, each paired with its original index.
+    ///
+    /// - Parameter inCount: maximum number of samples; a value <= 0 yields an empty array
+    /// - Returns: [(original index, element)]; every element when `count < inCount`
+    public func strideArray(inCount: Int = 20) -> [(Int, Element)] {
+        guard inCount > 0 else { return [] }  // avoids dividing by zero / forming an invalid range below
+        if count < inCount {
+            return (0..<count).map{ ($0, self[$0]) }
+        }
+        let stride = count / inCount
+        return (0..<inCount).map { ($0 * stride, self[$0 * stride]) }
     }
-  }
-  
-  public static func floatArrWithBuffer(floatArrBuffer: UnsafeMutablePointer<Float32>, count: Int) -> [Float32] {
-    var arr: [Float32] = []
-    for i in 0..<count {
-      arr.append(floatArrBuffer[i])
+    
+    /// Copies `count` consecutive Float32 values starting at `floatArrBuffer` into a new array.
+    ///
+    /// - Returns: the copied values, in buffer order
+    public static func floatArrWithBuffer(floatArrBuffer: UnsafeMutablePointer<Float32>, count: Int) -> [Float32] {
+        // reads floatArrBuffer[0] ... floatArrBuffer[count - 1]
+        return (0..<count).map { floatArrBuffer[$0] }
     }
-    return arr
-  }
 }
 
 extension UnsafeMutablePointer {
-  public func floatArr(count: Int) -> [Pointee]{
-    var arr: [Pointee] = []
-    for i in 0..<count {
-      arr.append(self[i])
+    /// Copies `count` consecutive pointees, starting at this pointer, into a new array.
+    ///
+    /// - Returns: the copied values, in memory order
+    public func floatArr(count: Int) -> [Pointee]{
+        // reads self[0] ... self[count - 1]
+        return (0..<count).map { self[$0] }
     }
-    return arr
-  }
 }
 
 extension String {
-  func cStr() -> UnsafePointer<Int8>? {
-    return (self as NSString).utf8String
-  }
+    func cStr() -> UnsafePointer<Int8>? {  // UTF-8 C string for this string, obtained through NSString
+        return (self as NSString).utf8String  // NOTE(review): pointer lifetime is tied to the bridged NSString — confirm callers do not keep it
+    }
 }
 
 func address<T: AnyObject>(o: T) -> String {
-  return String.init(format: "%018p", unsafeBitCast(o, to: Int.self))
+    return String.init(format: "%018p", unsafeBitCast(o, to: Int.self))  // object reference reinterpreted as Int and formatted with %018p
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift
index c3ba777b2740fd806be450b0e373fb37a95249e6..35fffb52ec2364adda2c9a9cd70b7d7ff3cf6f2e 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift
@@ -21,615 +21,615 @@ fileprivate var paddleMobileMetalLibrary: MTLLibrary?
 fileprivate var customMetalLibrary: MTLLibrary?
 
 extension MTLDevice {
-  func defaultLibrary() -> MTLLibrary {
-    if defaultMetalLibrary == nil {
-      defaultMetalLibrary = makeDefaultLibrary()
-    }
-    if let inDefaultLib = defaultMetalLibrary {
-      return inDefaultLib
-    } else {
-      fatalError(" default metal libary is nil")
-    }
-  }
-  
-  func customLibrary(metalLibPath: String) -> MTLLibrary {
-    if customMetalLibrary == nil {
-      do {
-        customMetalLibrary = try makeLibrary(filepath: metalLibPath)
-      } catch  let error {
-        fatalError("\(error)")
-      }
+    /// Returns the default Metal library, creating it with makeDefaultLibrary() on first use and caching it.
+    func defaultLibrary() -> MTLLibrary {
+        // The cache variable is declared outside this extension, so a cached library is returned on every later call.
+        if let cached = defaultMetalLibrary { return cached }
+        defaultMetalLibrary = makeDefaultLibrary()
+        guard let created = defaultMetalLibrary else {
+            fatalError(" default metal libary is nil")
+        }
+        return created
     }
     
-    if let inMetalLib = customMetalLibrary {
-      return inMetalLib
-    } else {
-      fatalError(" customlib is nil ")
-    }
-  }
-  
-  func paddleMobileLibrary() -> MTLLibrary {
-    if paddleMobileMetalLibrary == nil {
-      guard let path = Bundle.init(for: Kernel.self).path(forResource: "default", ofType: "metallib") else {
-        fatalError("Counld't find paddle mobile library")
-      }
-      do {
-        paddleMobileMetalLibrary = try makeLibrary(filepath: path)
-      } catch _ {
-        fatalError("Counld't load paddle mobile library")
-      }
+    /// Returns the Metal library loaded from `metalLibPath`, loading and caching it on first use; traps on failure.
+    /// NOTE(review): once a library is cached, `metalLibPath` is ignored — confirm only one custom library is ever needed.
+    func customLibrary(metalLibPath: String) -> MTLLibrary {
+        if let cached = customMetalLibrary { return cached }
+        // Nothing cached yet: load the library from the given path.
+        do {
+            customMetalLibrary = try makeLibrary(filepath: metalLibPath)
+        } catch  let error {
+            fatalError("\(error)")
+        }
+        guard let loaded = customMetalLibrary else {
+            fatalError(" customlib is nil ")
+        }
+        return loaded
     }
     
-    if let inPaddleMobileLib = paddleMobileMetalLibrary {
-      return inPaddleMobileLib
-    } else {
-      fatalError("PaddleMobile metal libary is nil")
-    }
-  }
-  
-  func pipeLine(funcName: String, metalLoadMode: MetalLoadMode, metalLibPath: String?) -> MTLComputePipelineState {
-    let useLib: MTLLibrary
-    switch metalLoadMode {
-    case .LoadMetalInDefaultLib:
-      useLib = defaultLibrary()
-    case .LoadMetalInPaddleMobile:
-      useLib = paddleMobileLibrary()
-    case .LoadMetalInCustomMetalLib:
-      useLib = customLibrary(metalLibPath: metalLibPath ?! " can't be nil ")
-    default:
-      fatalError()
+    /// Returns the Metal library bundled with paddle mobile ("default.metallib" in the bundle containing `Kernel`),
+    /// loading and caching it on first use. Traps when the file is missing or cannot be loaded.
+    func paddleMobileLibrary() -> MTLLibrary {
+        if let cached = paddleMobileMetalLibrary { return cached }
+        // Nothing cached yet: locate the metallib in the bundle of the Kernel class and load it.
+        guard let path = Bundle.init(for: Kernel.self).path(forResource: "default", ofType: "metallib") else {
+            fatalError("Counld't find paddle mobile library")
+        }
+        do {
+            paddleMobileMetalLibrary = try makeLibrary(filepath: path)
+        } catch _ {
+            fatalError("Counld't load paddle mobile library")
+        }
+        guard let loaded = paddleMobileMetalLibrary else {
+            fatalError("PaddleMobile metal libary is nil")
+        }
+        return loaded
     }
     
-    guard let function = useLib.makeFunction(name: funcName) else {
-      fatalError(" function " + funcName + " not found")
-    }
-    do {
-      let pipLine = try makeComputePipelineState(function: function)
-      return pipLine
-    } catch let error {
-      print(error)
-      fatalError("make pip line error occured : \(error)")
+    /// Builds a compute pipeline state for the Metal function `funcName`, looked up in the library that
+    /// `metalLoadMode` selects. `metalLibPath` is only read for .LoadMetalInCustomMetalLib and must not be nil then.
+    /// Traps when the function is not found or the pipeline state cannot be created.
+    func pipeLine(funcName: String, metalLoadMode: MetalLoadMode, metalLibPath: String?) -> MTLComputePipelineState {
+        let library: MTLLibrary
+        switch metalLoadMode {
+        case .LoadMetalInDefaultLib:
+            library = defaultLibrary()
+        case .LoadMetalInPaddleMobile:
+            library = paddleMobileLibrary()
+        case .LoadMetalInCustomMetalLib:
+            library = customLibrary(metalLibPath: metalLibPath ?! " can't be nil ")
+        default:
+            fatalError()
+        }
+        guard let kernelFunction = library.makeFunction(name: funcName) else {
+            fatalError(" function " + funcName + " not found")
+        }
+        do {
+            return try makeComputePipelineState(function: kernelFunction)
+        } catch let error {
+            print(error)
+            fatalError("make pip line error occured : \(error)")
+        }
     }
     
-  }
-  
-  func makeBuffer<P>(value: [P]) -> MTLBuffer {
-    let buffer = makeBuffer(length: value.count * MemoryLayout<P>.size, options: MTLResourceOptions.storageModeShared)
-    let contents = buffer?.contents().bindMemory(to: P.self, capacity: value.count * MemoryLayout<P>.size)
-    for i in 0..<value.count {
-      contents?[i] = value[i]
-    }
-    return buffer!
-  }
-  
-  func texture2tensor_loop<P>(texture: MTLTexture, cb: ([Int], P)->Void) -> Void {
-    let bpR = texture.width * 4 * MemoryLayout<P>.size
-    let bpI = texture.height * bpR
-    let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: 1))
-    for i in 0..<texture.arrayLength {
-      let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: bpI)
-      texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i)
-      for tx in 0..<texture.width * texture.height * 4 {
-        var k = tx
-        var xyzn: [Int] = [0, 0, 0, 0]
-        xyzn[1] = k / (texture.width * 4)
-        k %= (texture.width * 4)
-        xyzn[3] = k % 4
-        xyzn[0] = k / 4
-        xyzn[2] = i
-        cb(xyzn, pointer[tx])
-      }
-    }
-  }
-  
-  func texture2tensor_3<P>(texture: MTLTexture, dim: [Int],  transpose: [Int] = [0, 1, 2, 3]) -> [P] {
-    var tdim: [Int] = [1, 1, 1, 1]
-    for i in 0..<dim.count {
-      tdim[4 - dim.count + i] = dim[i]
-    }
-    let count = dim.reduce(1) { $0 * $1 }
-    var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
-    let ndim: [Int] = transpose.map { tdim[$0] }
-    assert(dim.count == 3)
-    assert(texture.width == ndim[3])
-    assert(texture.height == ndim[2])
-    assert(ndim[0] == 1)
-    assert(texture.arrayLength == (ndim[1] + 3) / 4)
-    texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
-      var tg: [Int] = [0, 0, 0, 0]
-      tg[1] = xyzn[2] * 4 + xyzn[3]
-      tg[2] = xyzn[1]
-      tg[3] = xyzn[0]
-      var ig: [Int] = [0, 0, 0, 0]
-      for k in 0..<4 {
-        ig[transpose[k]] = tg[k]
-      }
-      let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
-      if ix < count {
-        tensor[ix] = v
-      }
-    }
-    return tensor
-  }
-  
-  func texture2tensor_2<P>(texture: MTLTexture, dim: [Int],  transpose: [Int] = [0, 1, 2, 3]) -> [P] {
-    var tdim: [Int] = [1, 1, 1, 1]
-    for i in 0..<dim.count {
-      tdim[4 - dim.count + i] = dim[i]
+    /// Creates a shared-storage buffer containing a copy of `value`; traps when the buffer cannot be created.
+    func makeBuffer<P>(value: [P]) -> MTLBuffer {
+        // Size the buffer by stride (array element spacing) and bind by element count; capacity is not a byte count.
+        guard let buffer = makeBuffer(length: value.count * MemoryLayout<P>.stride, options: MTLResourceOptions.storageModeShared) else { fatalError(" make buffer error") }
+        let contents = buffer.contents().bindMemory(to: P.self, capacity: value.count)
+        for (i, element) in value.enumerated() { contents[i] = element }
+        return buffer
     }
-    let count = dim.reduce(1) { $0 * $1 }
-    var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
-    let ndim: [Int] = transpose.map { tdim[$0] }
-    assert(dim.count == 2)
-    let w = (ndim[3] + 3) / 4
-    assert(texture.width == w)
-    assert(texture.height == ndim[2])
-    assert(ndim[0] == 1)
-    assert(ndim[1] == 1)
-    assert(texture.arrayLength == 1)
     
-    texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
-      var tg: [Int] = [0, 0, 0, 0]
-      tg[2] = xyzn[1]
-      tg[3] = xyzn[0] * 4 + xyzn[3]
-      var ig: [Int] = [0, 0, 0, 0]
-      for k in 0..<4 {
-        ig[transpose[k]] = tg[k]
-      }
-      let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
-      if ix < count {
-        tensor[ix] = v
-      }
-    }
-    return tensor
-  }
-  
-  func texture2tensor_1<P>(texture: MTLTexture, dim: [Int],  transpose: [Int] = [0, 1, 2, 3]) -> [P] {
-    var tdim: [Int] = [1, 1, 1, 1]
-    for i in 0..<dim.count {
-      tdim[4 - dim.count + i] = dim[i]
+    /// Reads every array slice of `texture` and calls `cb` once per component with ([x, y, slice, component], value).
+    /// Assumes four components of type `P` per pixel — TODO confirm against the texture's pixel format.
+    func texture2tensor_loop<P>(texture: MTLTexture, cb: ([Int], P)->Void) -> Void {
+        let rowComponents = texture.width * 4
+        let sliceComponents = texture.height * rowComponents
+        let bpR = rowComponents * MemoryLayout<P>.size
+        let bpI = texture.height * bpR
+        let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: 1))
+        // One scratch buffer sized in elements (not bytes), reused for each slice and freed on every exit path.
+        let pointer = UnsafeMutablePointer<P>.allocate(capacity: sliceComponents)
+        defer { pointer.deallocate() }
+        for i in 0..<texture.arrayLength {
+            texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i)
+            for tx in 0..<sliceComponents {
+                let column = tx % rowComponents
+                cb([column / 4, tx / rowComponents, i, column % 4], pointer[tx])
+            }
+        }
     }
-    let count = dim.reduce(1) { $0 * $1 }
-    var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
-    let ndim: [Int] = transpose.map { tdim[$0] }
-    assert(dim.count == 1)
-    let w = (ndim[3] + 3) / 4
-    assert(texture.width == w)
-    assert(texture.height == 1)
-    assert(ndim[0] == 1)
-    assert(ndim[1] == 1)
-    assert(ndim[2] == 1)
-    assert(texture.arrayLength == 1)
     
-    texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
-      var tg: [Int] = [0, 0, 0, 0]
-      tg[3] = xyzn[0] * 4 + xyzn[3]
-      var ig: [Int] = [0, 0, 0, 0]
-      for k in 0..<4 {
-        ig[transpose[k]] = tg[k]
-      }
-      let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
-      if ix < count {
-        tensor[ix] = v
-      }
-    }
-    return tensor
-  }
-  
-  func texture2tensor<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
-    if dim.count == 3 {
-      return texture2tensor_3(texture: texture, dim: dim, transpose: transpose)
-    } else if dim.count == 2 {
-      return texture2tensor_2(texture: texture, dim: dim, transpose: transpose)
-    } else if dim.count == 1 {
-      return texture2tensor_1(texture: texture, dim: dim, transpose: transpose)
-    }
-    var tdim: [Int] = [1, 1, 1, 1]
-    for i in 0..<dim.count {
-      tdim[4 - dim.count + i] = dim[i]
+    func texture2tensor_3<P>(texture: MTLTexture, dim: [Int],  transpose: [Int] = [0, 1, 2, 3]) -> [P] {  // texture -> flat tensor for a 3-entry dim
+        var tdim: [Int] = [1, 1, 1, 1]
+        for i in 0..<dim.count {
+            tdim[4 - dim.count + i] = dim[i]  // right-align dim in four slots; leading slots stay 1
+        }
+        let count = dim.reduce(1) { $0 * $1 }
+        var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)  // the forced cast traps if a Float32 cannot be cast to P
+        let ndim: [Int] = transpose.map { tdim[$0] }
+        assert(dim.count == 3)
+        assert(texture.width == ndim[3])
+        assert(texture.height == ndim[2])
+        assert(ndim[0] == 1)
+        assert(texture.arrayLength == (ndim[1] + 3) / 4)  // four entries of ndim[1] per slice, rounded up
+        texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
+            var tg: [Int] = [0, 0, 0, 0]
+            tg[1] = xyzn[2] * 4 + xyzn[3]
+            tg[2] = xyzn[1]
+            tg[3] = xyzn[0]
+            var ig: [Int] = [0, 0, 0, 0]
+            for k in 0..<4 {
+                ig[transpose[k]] = tg[k]  // map the index from ndim (transposed) order back to tdim order
+            }
+            let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]  // row-major offset within tdim
+            if ix < count {
+                tensor[ix] = v
+            }
+        }
+        return tensor
     }
-    let count = dim.reduce(1) { $0 * $1 }
-    var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
-    let ndim: [Int] = transpose.map { tdim[$0] }
     
-    assert(texture.width == ndim[2])
-    assert(texture.height == ndim[1])
-    assert(texture.arrayLength == (ndim[0] * ndim[3] + 3) / 4)
-    
-    texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
-      var tg: [Int] = [0, 0, 0, 0]
-      tg[1] = xyzn[1]
-      tg[2] = xyzn[0]
-      tg[0] = (xyzn[2] * 4 + xyzn[3]) / ndim[3]
-      tg[3] = (xyzn[2] * 4 + xyzn[3]) % ndim[3]
-      var ig: [Int] = [0, 0, 0, 0]
-      for k in 0..<4 {
-        ig[transpose[k]] = tg[k]
-      }
-      let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
-      if ix < count {
-        tensor[ix] = v
-      }
-    }
-    return tensor
-  }
-  
-  func tensor2texture<P>(value: [P], dim: [Int], transpose: [Int] = [0, 1, 2, 3], inComputePrecision: ComputePrecision = .Float32) -> MTLTexture {
-    if value.count > 0 {
-      assert(value.count == dim.reduce(1) { $0 * $1 })
+    /// Reads `texture` back into a flat array for a rank-2 `dim`, mapping coordinates back through `transpose`.
+    /// `dim` is right-aligned into a 4-axis shape padded with leading 1s; the result is row-major over that shape.
+    func texture2tensor_2<P>(texture: MTLTexture, dim: [Int],  transpose: [Int] = [0, 1, 2, 3]) -> [P] {
+        var padded = [Int](repeating: 1, count: 4)
+        let lead = 4 - dim.count
+        for (offset, extent) in dim.enumerated() {
+            padded[lead + offset] = extent
+        }
+        let total = dim.reduce(1, *)
+        var flat = [P](repeating: Float32(0.0) as! P, count: total)
+        let permuted = transpose.map { padded[$0] }
+        assert(dim.count == 2)
+        let packedWidth = (permuted[3] + 3) / 4
+        assert(texture.width == packedWidth)
+        assert(texture.height == permuted[2])
+        assert(permuted[0] == 1)
+        assert(permuted[1] == 1)
+        assert(texture.arrayLength == 1)
+
+        texture2tensor_loop(texture: texture) { (texel: [Int], sample: P) in
+            // Permuted-space coordinates: texel[1] on axis 2, texel[0] * 4 + texel[3] on axis 3.
+            let src = [0, 0, texel[1], texel[0] * 4 + texel[3]]
+            var dst = [Int](repeating: 0, count: 4)
+            (0..<4).forEach { dst[transpose[$0]] = src[$0] }
+            // Row-major offset of `dst` inside the padded shape.
+            let ix = dst[0] * padded[1] * padded[2] * padded[3] + dst[1] * padded[2] * padded[3] + dst[2] * padded[3] + dst[3]
+            guard ix < total else { return }
+            flat[ix] = sample
+        }
+        return flat
     }
     
-    var tdim: [Int] = [1, 1, 1, 1]
-    for i in 0..<dim.count {
-      tdim[4 - dim.count + i] = dim[i]
+    /// Reads `texture` back into a flat array for a rank-1 `dim`, mapping coordinates back through `transpose`.
+    /// `dim` is right-aligned into a 4-axis shape padded with leading 1s; the result is row-major over that shape.
+    func texture2tensor_1<P>(texture: MTLTexture, dim: [Int],  transpose: [Int] = [0, 1, 2, 3]) -> [P] {
+        var padded = [Int](repeating: 1, count: 4)
+        let lead = 4 - dim.count
+        for (offset, extent) in dim.enumerated() {
+            padded[lead + offset] = extent
+        }
+        let total = dim.reduce(1, *)
+        var flat = [P](repeating: Float32(0.0) as! P, count: total)
+        let permuted = transpose.map { padded[$0] }
+        assert(dim.count == 1)
+        let packedWidth = (permuted[3] + 3) / 4
+        assert(texture.width == packedWidth)
+        assert(texture.height == 1)
+        assert(permuted[0] == 1)
+        assert(permuted[1] == 1)
+        assert(permuted[2] == 1)
+        assert(texture.arrayLength == 1)
+
+        texture2tensor_loop(texture: texture) { (texel: [Int], sample: P) in
+            // Only axis 3 of the permuted shape varies: texel[0] * 4 + texel[3].
+            let src = [0, 0, 0, texel[0] * 4 + texel[3]]
+            var dst = [Int](repeating: 0, count: 4)
+            (0..<4).forEach { dst[transpose[$0]] = src[$0] }
+            let ix = dst[0] * padded[1] * padded[2] * padded[3] + dst[1] * padded[2] * padded[3] + dst[2] * padded[3] + dst[3]
+            guard ix < total else { return }
+            flat[ix] = sample
+        }
+        return flat
     }
-    let ndim: [Int] = transpose.map { tdim[$0] }
     
-    let textureDesc = MTLTextureDescriptor.init()
-    textureDesc.width = ndim[2]
-    textureDesc.height = ndim[1]
-    textureDesc.depth = 1
-    textureDesc.usage = [.shaderRead, .shaderWrite]
-    
-    if inComputePrecision == .Float16 {
-      textureDesc.pixelFormat = .rgba16Float
-    } else if inComputePrecision == .Float32 {
-      textureDesc.pixelFormat = .rgba32Float
+    /// Reads `texture` back into a flat array shaped by `dim`, mapping coordinates back through `transpose`.
+    /// Ranks 3, 2 and 1 are forwarded to `texture2tensor_3`, `texture2tensor_2` and `texture2tensor_1`.
+    /// For any other rank, `dim` is right-aligned into a 4-axis shape padded with leading 1s.
+    func texture2tensor<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
+        switch dim.count {
+        case 3: return texture2tensor_3(texture: texture, dim: dim, transpose: transpose)
+        case 2: return texture2tensor_2(texture: texture, dim: dim, transpose: transpose)
+        case 1: return texture2tensor_1(texture: texture, dim: dim, transpose: transpose)
+        default: break
+        }
+        // Any other rank falls through to the 4-axis layout below.
+        var padded = [Int](repeating: 1, count: 4)
+        let lead = 4 - dim.count
+        for (offset, extent) in dim.enumerated() {
+            padded[lead + offset] = extent
+        }
+        let total = dim.reduce(1, *)
+        var flat = [P](repeating: Float32(0.0) as! P, count: total)
+        let permuted = transpose.map { padded[$0] }
+
+        assert(texture.width == permuted[2])
+        assert(texture.height == permuted[1])
+        assert(texture.arrayLength == (permuted[0] * permuted[3] + 3) / 4)
+
+        texture2tensor_loop(texture: texture) { (texel: [Int], sample: P) in
+            // texel[2] * 4 + texel[3] is a combined index over permuted axes 0 and 3; split it by the extent of axis 3.
+            let packed = texel[2] * 4 + texel[3]
+            let src = [packed / permuted[3], texel[1], texel[0], packed % permuted[3]]
+            var dst = [Int](repeating: 0, count: 4)
+            (0..<4).forEach { dst[transpose[$0]] = src[$0] }
+            // Row-major offset of `dst` inside the padded shape.
+            let ix = dst[0] * padded[1] * padded[2] * padded[3] + dst[1] * padded[2] * padded[3] + dst[2] * padded[3] + dst[3]
+            guard ix < total else { return }
+            flat[ix] = sample
+        }
+        return flat
     }
     
-    textureDesc.textureType = .type2DArray
-    textureDesc.storageMode = .shared
-    textureDesc.cpuCacheMode = .defaultCache
-    textureDesc.arrayLength = (ndim[0] * ndim[3] + 3) / 4
-    let texture = makeTexture(descriptor: textureDesc)!
-    
-    if value.count > 0 {
-      var rcount: Int = (ndim[0] * ndim[3] + 3) / 4
-      rcount = rcount * 4 * ndim[1] * ndim[2]
-      var nvalue: [Float32] = .init(repeating: 0.0, count: rcount)
-      
-      for i0 in 0..<tdim[0] {
-        for i1 in 0..<tdim[1] {
-          for i2 in 0..<tdim[2] {
-            for i3 in 0..<tdim[3] {
-              let ig = [i0, i1, i2, i3]
-              let ix = (i0 * tdim[1] * tdim[2] * tdim[3]) + (i1 * tdim[2] * tdim[3]) + (i2 * tdim[3]) + i3
-              
-              let jg = transpose.map { ig[$0] }
-              let k = jg[0] * ndim[3] + jg[3]
-              let jx = ((k / 4) * ndim[1] * ndim[2] * 4) + (jg[1] * ndim[2] * 4) + (jg[2] * 4) + (k % 4)
-              
-              nvalue[jx] = value[ix] as! Float32
-            }
-          }
-        }
-      }
-      
-      let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: ndim[2], height: ndim[1], depth: 1))
-      if inComputePrecision == .Float16 {
-        let xvalue: [UInt16] = .init(repeating: 0, count: rcount)
-        let pointer: UnsafeMutablePointer<Float32> = UnsafeMutablePointer(mutating: nvalue)
-        let outputP: UnsafeMutablePointer<UInt16> = UnsafeMutablePointer(mutating: xvalue)
-        float32ToFloat16(input: pointer, output: outputP, count: rcount)
-        let bpR = ndim[2] * 4 * 2
-        let bpI = ndim[1] * bpR
-        for i in 0..<textureDesc.arrayLength {
-          let p = outputP + texture.width * texture.height * 4 * i
-          texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: p, bytesPerRow: bpR, bytesPerImage: bpI)
-        }
-      } else {
-        let pointer: UnsafeMutablePointer<Float32> = UnsafeMutablePointer(mutating: nvalue)
-        let bpR = ndim[2] * 4 * MemoryLayout<P>.size
-        let bpI = ndim[1] * bpR
-        for i in 0..<textureDesc.arrayLength {
-          let p = pointer + texture.width * texture.height * 4 * i
-          texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: p, bytesPerRow: bpR, bytesPerImage: bpI)
-        }
-      }
+    /// Creates a `.type2DArray` texture for a tensor of shape `dim` (permuted by `transpose`) and uploads `value`.
+    /// `dim` is right-aligned into a 4-axis shape padded with leading 1s. Of the permuted axes, axis 2 is the
+    /// texture width, axis 1 the height, and axes 0 and 3 are flattened and packed four values per texel.
+    /// - Parameter value: Row-major data whose elements must cast to `Float32`; if empty, nothing is uploaded.
+    /// - Parameter inComputePrecision: `.Float16` stores `rgba16Float` texels, `.Float32` stores `rgba32Float`.
+    /// - Returns: The new texture; a failed allocation traps on the force unwrap.
+    func tensor2texture<P>(value: [P], dim: [Int], transpose: [Int] = [0, 1, 2, 3], inComputePrecision: ComputePrecision = .Float32) -> MTLTexture {
+        if value.count > 0 {
+            assert(value.count == dim.reduce(1) { $0 * $1 })
+        }
+        var tdim: [Int] = [1, 1, 1, 1]
+        for i in 0..<dim.count {
+            tdim[4 - dim.count + i] = dim[i]
+        }
+        let ndim: [Int] = transpose.map { tdim[$0] }
+        let textureDesc = MTLTextureDescriptor.init()
+        textureDesc.width = ndim[2]
+        textureDesc.height = ndim[1]
+        textureDesc.depth = 1
+        textureDesc.usage = [.shaderRead, .shaderWrite]
+        if inComputePrecision == .Float16 {
+            textureDesc.pixelFormat = .rgba16Float
+        } else if inComputePrecision == .Float32 {
+            textureDesc.pixelFormat = .rgba32Float
+        }
+        textureDesc.textureType = .type2DArray
+        textureDesc.storageMode = .shared
+        textureDesc.cpuCacheMode = .defaultCache
+        textureDesc.arrayLength = (ndim[0] * ndim[3] + 3) / 4
+        let texture = makeTexture(descriptor: textureDesc)!
+        guard value.count > 0 else { return texture }
+
+        // Staging buffer in upload order: slice, then row, then column, then the four lanes of a texel.
+        let sliceStride = ndim[1] * ndim[2] * 4
+        let rcount = textureDesc.arrayLength * sliceStride
+        var nvalue: [Float32] = .init(repeating: 0.0, count: rcount)
+        for i0 in 0..<tdim[0] {
+            for i1 in 0..<tdim[1] {
+                for i2 in 0..<tdim[2] {
+                    for i3 in 0..<tdim[3] {
+                        let ig = [i0, i1, i2, i3]
+                        let ix = (i0 * tdim[1] * tdim[2] * tdim[3]) + (i1 * tdim[2] * tdim[3]) + (i2 * tdim[3]) + i3
+                        let jg = transpose.map { ig[$0] }
+                        let k = jg[0] * ndim[3] + jg[3]
+                        let jx = ((k / 4) * sliceStride) + (jg[1] * ndim[2] * 4) + (jg[2] * 4) + (k % 4)
+                        nvalue[jx] = value[ix] as! Float32
+                    }
+                }
+            }
+        }
+
+        let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: ndim[2], height: ndim[1], depth: 1))
+        // Array storage is only guaranteed to stay valid inside the withUnsafe... closures, so every pointer
+        // is used there; a pointer kept from an implicit array-to-pointer conversion would dangle.
+        func upload<T>(_ staged: UnsafeBufferPointer<T>) {
+            guard let base = staged.baseAddress else { return }
+            let bpR = ndim[2] * 4 * MemoryLayout<T>.stride
+            for i in 0..<textureDesc.arrayLength {
+                texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: base + sliceStride * i, bytesPerRow: bpR, bytesPerImage: ndim[1] * bpR)
+            }
+        }
+        if inComputePrecision == .Float16 {
+            var xvalue: [UInt16] = .init(repeating: 0, count: rcount)
+            nvalue.withUnsafeMutableBufferPointer { src in
+                xvalue.withUnsafeMutableBufferPointer { dst in
+                    guard let input = src.baseAddress, let output = dst.baseAddress else { return }
+                    float32ToFloat16(input: input, output: output, count: rcount)
+                }
+            }
+            xvalue.withUnsafeBufferPointer { upload($0) }
+        } else {
+            nvalue.withUnsafeBufferPointer { upload($0) }
+        }
+        return texture
     }
-    return texture
-  }
-  
-  func makeFloatTexture<P>(value: [P], textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture{
-    
-    let textureDesc = MTLTextureDescriptor.init()
-    textureDesc.width = textureWidth
-    textureDesc.height = textureHeight
-    textureDesc.depth = 1
-    textureDesc.usage = [.shaderRead, .shaderWrite]
-    textureDesc.pixelFormat = .rgba32Float
-    textureDesc.textureType = .type2DArray
-    textureDesc.storageMode = .shared
-    textureDesc.cpuCacheMode = .defaultCache
-    textureDesc.arrayLength = arrayLength
-    let texture = makeTexture(descriptor: textureDesc)!
     
-    if value.count >= 4{
-      let counts = arrayLength * 4 * textureWidth * textureHeight
-      let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: counts * MemoryLayout<P>.size)
-      for i in 0..<value.count {
-        pointer[i] = value[i]
-      }
-      for i in value.count..<counts {
-        pointer[i] = 0 as! P
-      }
-      
-      let bytesPerRow = texture.width * texture.depth * 4 * MemoryLayout<P>.size
-      let bytesPerImage = texture.height * bytesPerRow
-      let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: texture.depth))
-      for i in 0..<arrayLength {
-        let p = pointer + texture.width * texture.height * 4 * i
-        texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: p, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage)
-      }
-    } else {
-      
+    /// Creates a shared `rgba32Float` `.type2DArray` texture and fills it from `value`.
+    /// Nothing is uploaded when `value` has fewer than four elements. Otherwise `value` is copied in its
+    /// existing order, one slice after another; a shorter `value` is padded with `0 as! P`, and a `value`
+    /// larger than the texture is a programmer error that traps.
+    /// - Returns: The new texture; a failed allocation traps on the force unwrap.
+    func makeFloatTexture<P>(value: [P], textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture{
+        let textureDesc = MTLTextureDescriptor.init()
+        textureDesc.width = textureWidth
+        textureDesc.height = textureHeight
+        textureDesc.depth = 1
+        textureDesc.usage = [.shaderRead, .shaderWrite]
+        textureDesc.pixelFormat = .rgba32Float
+        textureDesc.textureType = .type2DArray
+        textureDesc.storageMode = .shared
+        textureDesc.cpuCacheMode = .defaultCache
+        textureDesc.arrayLength = arrayLength
+        let texture = makeTexture(descriptor: textureDesc)!
+        guard value.count >= 4 else { return texture }
+        let counts = arrayLength * 4 * textureWidth * textureHeight
+        precondition(value.count <= counts, "value does not fit in the texture")
+        // `capacity` is measured in elements of `P`, not bytes; the staging copy is released on every exit.
+        let pointer = UnsafeMutablePointer<P>.allocate(capacity: counts)
+        defer {
+            pointer.deinitialize(count: counts)
+            pointer.deallocate()
+        }
+        pointer.initialize(from: value, count: value.count)
+        if value.count < counts {
+            (pointer + value.count).initialize(repeating: 0 as! P, count: counts - value.count)
+        }
+
+        let bytesPerRow = texture.width * texture.depth * 4 * MemoryLayout<P>.size
+        let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: texture.depth))
+        for i in 0..<arrayLength {
+            texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: pointer + texture.width * texture.height * 4 * i, bytesPerRow: bytesPerRow, bytesPerImage: texture.height * bytesPerRow)
+        }
+        return texture
     }
-    
-    return texture
-  }
 }
 
 extension MTLComputeCommandEncoder {
-  public func dispatch(computePipline: MTLComputePipelineState, outTexture: MTLTexture) {
-    let slices = (outTexture.arrayLength * 4 + 3)/4
-    
-    let width = computePipline.threadExecutionWidth
-    let height = computePipline.maxTotalThreadsPerThreadgroup/width
-    let threadsPerGroup = MTLSize.init(width: width, height: height, depth: 1)
-    
-//    print(" thread: threads per group: \(threadsPerGroup) ")
-//    print(" thread: out texture width: \(outTexture.width) , out texture height: \(outTexture.height)")
-    
-    let groupWidth = (outTexture.width + width - 1)/width
-    let groupHeight = (outTexture.height + height - 1)/height
-    let groupDepth = slices
-    let groups = MTLSize.init(width: groupWidth, height: groupHeight, depth: groupDepth)
-    
-    setComputePipelineState(computePipline)
-    
-    dispatchThreadgroups(groups, threadsPerThreadgroup: threadsPerGroup)
-  }
+    /// Binds `computePipline` and dispatches enough threadgroups to cover every texel of `outTexture`.
+    ///
+    /// Each threadgroup is `threadExecutionWidth` wide and as tall as `maxTotalThreadsPerThreadgroup` allows;
+    /// the grid is rounded up in x and y and is `(arrayLength * 4 + 3) / 4` groups deep.
+    public func dispatch(computePipline: MTLComputePipelineState, outTexture: MTLTexture) {
+        let groupW = computePipline.threadExecutionWidth
+        let groupH = computePipline.maxTotalThreadsPerThreadgroup / groupW
+
+        // Ceiling division so partially covered edges still get a threadgroup.
+        let columns = (outTexture.width + groupW - 1) / groupW
+        let rows = (outTexture.height + groupH - 1) / groupH
+        let layers = (outTexture.arrayLength * 4 + 3) / 4
+
+        setComputePipelineState(computePipline)
+        dispatchThreadgroups(
+            MTLSize(width: columns, height: rows, depth: layers),
+            threadsPerThreadgroup: MTLSize(width: groupW, height: groupH, depth: 1)
+        )
+    }
 }
 
 public extension MTLTexture {
-  
-  func stridableFloatArray<P>(stridable: Bool = true) -> [(index: Int, value: P)] {
-    var arr: [P] = floatArray { (p: P) -> P in
-      return p;
-    }
-    var result:  [(index: Int, value: P)] = []
-    if arr.count > 100 && stridable {
-      for j in stride(from: 0, to: arr.count , by: arr.count / 100){
-        result.append((j, arr[j]))
-      }
-    } else {
-      for j in 0..<arr.count {
-        result.append((j, arr[j]))
-      }
-    }
-    return result
-  }
-  
-  func floatArray<P, T>(res: (P) -> T) -> [T] {
-    var fArr: [T] = []
-    if textureType == .type2DArray {
-      for i in 0..<arrayLength{
-        let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<P>.size, alignment: MemoryLayout<P>.alignment)
-        let bytesPerRow = width * depth * 4 * MemoryLayout<P>.size
-        let bytesPerImage = width * height * depth * 4 * MemoryLayout<P>.size
-        let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
-        getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i)
-        let p = bytes.assumingMemoryBound(to: P.self)
-        
-        for j in 0..<width * height * depth * 4 {
-          fArr.append(res(p[j]))
-        }
-        bytes.deallocate()
-      }
-    } else if textureType == .type2D {
-      let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<P>.size, alignment: MemoryLayout<P>.alignment)
-      let bytesPerRow = width * depth * 4 * MemoryLayout<P>.size
-      let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
-      getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0)
-      let p = bytes.assumingMemoryBound(to: P.self)
-      
-      for j in 0..<width * height * 4 {
-        fArr.append(res(p[j]))
-      }
-      bytes.deallocate()
+    
+    /// Reads the texture through `floatArray` and pairs each component with its flat index.
+    ///
+    /// - Parameter stridable: When `true` and there are more than 100 components, only every
+    ///   `count / 100`-th component is returned; otherwise all of them are.
+    /// - Returns: `(index, value)` tuples in ascending index order.
+    func stridableFloatArray<P>(stridable: Bool = true) -> [(index: Int, value: P)] {
+        let values: [P] = floatArray { (p: P) -> P in p }
+        let total = values.count
+
+        // The step is at least 1: sampling only happens when `total` exceeds 100.
+        let step = (stridable && total > 100) ? total / 100 : 1
+
+        return stride(from: 0, to: total, by: step).map { position in
+            (index: position, value: values[position])
+        }
     }
-    return fArr
-  }
-  
-  func float32Array() -> [Float32] {
-    if pixelFormat == .rgba32Float {
-      let float32Array = floatArray { (f: Float32) -> Float32 in
-        return f
-      }
-      return float32Array
-    } else if pixelFormat == .rgba16Float {
-      
-      var float16Array = floatArray { (f: Float16) -> Float16 in
-        return f
-      }
-      return float16To32(input: &float16Array, count: float16Array.count)
-    } else {
-      fatalError()
+    
+    /// Copies every texel component out of the texture and maps each one through `res`.
+    ///
+    /// `P` is the component type used to interpret the raw bytes (four components per texel). `.type2DArray`
+    /// textures are read slice by slice and `.type2D` textures in one pass; other texture types yield `[]`.
+    func floatArray<P, T>(res: (P) -> T) -> [T] {
+        let texelBytes = 4 * MemoryLayout<P>.size
+        let rowBytes = width * depth * texelBytes
+        let wholeRegion = MTLRegion(origin: MTLOrigin(x: 0, y: 0, z: 0), size: MTLSize(width: width, height: height, depth: depth))
+        var collected: [T] = []
+
+        switch textureType {
+        case .type2DArray:
+            for slice in 0..<arrayLength {
+                let scratch = UnsafeMutableRawPointer.allocate(byteCount: width * height * texelBytes, alignment: MemoryLayout<P>.alignment)
+                defer { scratch.deallocate() }
+                getBytes(scratch, bytesPerRow: rowBytes, bytesPerImage: width * height * depth * texelBytes, from: wholeRegion, mipmapLevel: 0, slice: slice)
+                let typed = scratch.assumingMemoryBound(to: P.self)
+                collected.append(contentsOf: (0..<width * height * depth * 4).map { res(typed[$0]) })
+            }
+        case .type2D:
+            let scratch = UnsafeMutableRawPointer.allocate(byteCount: width * height * texelBytes, alignment: MemoryLayout<P>.alignment)
+            defer { scratch.deallocate() }
+            getBytes(scratch, bytesPerRow: rowBytes, from: wholeRegion, mipmapLevel: 0)
+            let typed = scratch.assumingMemoryBound(to: P.self)
+            collected.append(contentsOf: (0..<width * height * 4).map { res(typed[$0]) })
+        default:
+            break
+        }
+        return collected
     }
-  }
-  
-  func logDesc<T>(header: String = "", stridable: Bool = true) -> T? {
-    print(header)
-    print("texture: \(self)")
-    //        let res: [(index: Int, value: T)] = stridableFloatArray(stridable: stridable)
-    //        print(res)
     
-    if textureType == .type2DArray {
-      for i in 0..<arrayLength{
-        var str: String = "slice: \(i): \n"
-        let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<T>.size, alignment: MemoryLayout<T>.alignment)
-        let bytesPerRow = width * depth * 4 * MemoryLayout<T>.size
-        let bytesPerImage = width * height * depth * 4 * MemoryLayout<T>.size
-        let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
-        getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i)
-        let p = bytes.assumingMemoryBound(to: T.self)
-        str += "2d array count : \(width * height * depth * 4) \n"
-        if stridable && width * height * depth * 4 > 20 {
-          for j in stride(from: 0, to: width * height * depth * 4 , by: width * height * depth * 4 / 20){
-            str += " index \(j): \(p[j])"
-          }
+    /// Returns every component of the texture as `Float32`.
+    /// `rgba32Float` data is returned as read; `rgba16Float` data is passed through `float16To32`.
+    /// Any other pixel format is a fatal error.
+    func float32Array() -> [Float32] {
+        if pixelFormat == .rgba32Float {
+            // Components are already 32-bit floats, so they are forwarded unchanged.
+            return floatArray { (f: Float32) -> Float32 in f }
+        } else if pixelFormat == .rgba16Float {
+            // Read the raw 16-bit components first, then hand them to the conversion helper.
+            var halves = floatArray { (h: Float16) -> Float16 in h }
+            let halfCount = halves.count
+            return float16To32(input: &halves, count: halfCount)
         } else {
-          for j in 0..<width * height * depth * 4 {
-            str += " index \(j): \(p[j])"
-          }
+            fatalError()
         }
-        
-        bytes.deallocate()
-        print(str)
-      }
-    } else if textureType == .type2D {
-      var str: String = "texture 2D: "
-      let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<T>.size, alignment: MemoryLayout<T>.alignment)
-      let bytesPerRow = width * depth * 4 * MemoryLayout<T>.size
-      let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
-      getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0)
-      let p = bytes.assumingMemoryBound(to: T.self)
-      str += "2d count : \(width * width * 4) \n"
-      
-      if stridable {
-        for j in stride(from: 0, to: width * height * 4, by: width * height * 4 / 20){
-          str += "index \(j): \(p[j]) "
-        }
-      } else {
-        for j in 0..<width * height * 4 {
-          str += "index \(j): \(p[j]) "
-        }
-      }
-      
-      print(str)
-      bytes.deallocate()
     }
-    return nil
     
-  }
-  
-  // n c h w - dim
-  func toTensor(dim: (n: Int, c: Int, h: Int, w: Int)) -> [Float32] {
-    var textureArray: [Float32]
-    if pixelFormat == .rgba32Float {
-      textureArray = floatArray { (i : Float32) -> Float32 in
-        return i
-      }
-    } else if pixelFormat == .rgba16Float {
-      
-      var textureFloat16Array = floatArray { (i : Float16) -> Float16 in
-        return i
-      }
-      textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count)
-    } else {
-      fatalError(" 目前还不支持其他类型 ")
-    }
-    print(textureArray.count)
-    var output: [Float32] = []
-    for s in 0..<arrayLength {
-      for c in 0..<4{
-        for h in 0..<dim.h {
-          for w in 0..<dim.w {
-            if (s * 4 + c) < dim.c {
-              let textureValue = textureArray[dim.w * dim.h * 4 * s + h * dim.w * 4 + w * 4 + c]
-              output.append(textureValue)
+    /// Prints `header`, the texture description, and its components interpreted as `T`; always returns `nil`.
+    ///
+    /// - Parameter stridable: When `true` and a slice (or 2D texture) holds more than 20 components, only every
+    ///   `count / 20`-th component is printed. Texture types other than `.type2DArray`/`.type2D` print no values.
+    func logDesc<T>(header: String = "", stridable: Bool = true) -> T? {
+        print(header)
+        print("texture: \(self)")
+        if textureType == .type2DArray {
+            for i in 0..<arrayLength{
+                var str: String = "slice: \(i): \n"
+                let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<T>.size, alignment: MemoryLayout<T>.alignment)
+                let bytesPerRow = width * depth * 4 * MemoryLayout<T>.size
+                let bytesPerImage = width * height * depth * 4 * MemoryLayout<T>.size
+                let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
+                getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i)
+                let p = bytes.assumingMemoryBound(to: T.self)
+                str += "2d array count : \(width * height * depth * 4) \n"
+                if stridable && width * height * depth * 4 > 20 {
+                    for j in stride(from: 0, to: width * height * depth * 4 , by: width * height * depth * 4 / 20){
+                        str += " index \(j): \(p[j])"
+                    }
+                } else {
+                    for j in 0..<width * height * depth * 4 {
+                        str += " index \(j): \(p[j])"
+                    }
+                }
+                bytes.deallocate()
+                print(str)
             }
-          }
+        } else if textureType == .type2D {
+            var str: String = "texture 2D: "
+            let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<T>.size, alignment: MemoryLayout<T>.alignment)
+            let bytesPerRow = width * depth * 4 * MemoryLayout<T>.size
+            let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
+            getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0)
+            let p = bytes.assumingMemoryBound(to: T.self)
+            let count = width * height * 4
+            str += "2d count : \(count) \n"
+            // Sample only when `count / 20` is non-zero; a zero stride traps at runtime.
+            if stridable && count > 20 {
+                for j in stride(from: 0, to: count, by: count / 20){
+                    str += "index \(j): \(p[j]) "
+                }
+            } else {
+                for j in 0..<count {
+                    str += "index \(j): \(p[j]) "
+                }
+            }
+            print(str)
+            bytes.deallocate()
         }
-      }
+        return nil
+
     }
-    return output
-  }
-  
-  func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int)) -> [Float32] {
-//    print("origin dim: \(dim)")
-//    print("texture: ")
-//    print(self)
     
-    var textureArray: [Float32]
-    if pixelFormat == .rgba32Float {
-      textureArray = floatArray { (i : Float32) -> Float32 in
-        return i
-      }
-    } else if pixelFormat == .rgba16Float {
-      var textureFloat16Array = floatArray { (i : Float16) -> Float16 in
-        return i
-      }
-      textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count)
-    } else {
-      fatalError(" 目前还不支持其他类型 ")
+    /// Flattens the texture into channel-major order for a tensor whose `dim` is given as `n c h w`.
+    ///
+    /// Components are read as `Float32` (`rgba16Float` data is passed through `float16To32`). Channel
+    /// `s * 4 + c` comes from lane `c` of slice `s`; channels at or beyond `dim.c` are skipped. `dim.n` is not used.
+    func toTensor(dim: (n: Int, c: Int, h: Int, w: Int)) -> [Float32] {
+        let texels: [Float32]
+        switch pixelFormat {
+        case .rgba32Float:
+            texels = floatArray { (i: Float32) -> Float32 in i }
+        case .rgba16Float:
+            var halves = floatArray { (i: Float16) -> Float16 in i }
+            let halfCount = halves.count
+            texels = float16To32(input: &halves, count: halfCount)
+        default:
+            fatalError(" 目前还不支持其他类型 ")
+        }
+        print(texels.count)
+
+        // Offsets below index `texels` as slice, then row, then column, then lane.
+        var output: [Float32] = []
+        let sliceStride = dim.w * dim.h * 4
+        for s in 0..<arrayLength {
+            for c in 0..<4 {
+                for h in 0..<dim.h {
+                    for w in 0..<dim.w where (s * 4 + c) < dim.c {
+                        output.append(texels[sliceStride * s + h * dim.w * 4 + w * 4 + c])
+                    }
+                }
+            }
+        }
+        return output
     }
     
-    var output: [Float32] = []
-    let numOfASlice = dim.h * dim.w * 4
-    for h in 0..<dim.h {
-      for w in 0..<dim.w {
-        for sliceIndex in 0..<arrayLength {
-          if sliceIndex * 4 + 4 > dim.c {
-            for i in 0..<(4 - ((sliceIndex * 4 + 4) - dim.c)) {
-              let value = textureArray[sliceIndex * numOfASlice + h * dim.w * 4 + w * 4 + i]
-              output.append(value)
-            }
-          } else {
-            for i in 0..<4 {
-              let value = textureArray[sliceIndex * numOfASlice + h * dim.w * 4 + w * 4 + i]
-              output.append(value)
-            }
-          }
-        }
-      }
+    /// Flattens the texture into `h w c` order, emitting at most `dim.c` channels per pixel.
+    ///
+    /// Components are read as `Float32` (`rgba16Float` data is passed through `float16To32`). For every pixel
+    /// the slices are visited in order; slice `s` supplies channels `4 * s ..< min(4 * s + 4, dim.c)` and a
+    /// slice that starts at or beyond `dim.c` supplies nothing.
+    /// - Parameter dim: Only `h`, `w` and `c` are read; they are assumed to match the texture layout.
+    /// - Returns: The selected components, pixel by pixel.
+    func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int)) -> [Float32] {
+        let texels: [Float32]
+        switch pixelFormat {
+        case .rgba32Float:
+            texels = floatArray { (i: Float32) -> Float32 in i }
+        case .rgba16Float:
+            var halves = floatArray { (i: Float16) -> Float16 in i }
+            let halfCount = halves.count
+            texels = float16To32(input: &halves, count: halfCount)
+        default:
+            fatalError(" 目前还不支持其他类型 ")
+        }
+
+        var output: [Float32] = []
+        // Component count of one slice when the texture is `dim.w` by `dim.h`.
+        let numOfASlice = dim.h * dim.w * 4
+        for h in 0..<dim.h {
+            for w in 0..<dim.w {
+                let pixelBase = h * dim.w * 4 + w * 4
+                for sliceIndex in 0..<arrayLength {
+                    // Lanes of this slice that still belong to the tensor. Clamped at zero because a
+                    // negative upper bound would make `0..<lanes` trap when the texture has spare slices.
+                    let lanes = max(0, min(4, dim.c - sliceIndex * 4))
+                    let start = sliceIndex * numOfASlice + pixelBase
+                    for lane in 0..<lanes {
+                        output.append(texels[start + lane])
+                    }
+                }
+            }
+        }
+
+        return output
     }
-    return output
-  }
-  
+    
 }
 
 
 public extension MTLBuffer {
-  func logDesc<T>(header: String = "", stridable: Bool = true) -> T? {
-    print(header)
-    print("MTLBuffer: \(self) ")
-    var str = ""
-    if stridable && length/MemoryLayout<T>.stride > 1000{
-      for j in stride(from: 0, to: length, by: length/MemoryLayout<T>.stride / 100){
-        str += " \(contents().assumingMemoryBound(to: T.self)[j])"
-      }
-    } else {
-      for i in 0..<length/MemoryLayout<T>.size {
-        str += " \(contents().assumingMemoryBound(to: T.self)[i])"
-      }
+    func logDesc<T>(header: String = "", stridable: Bool = true) -> T? {
+        // Debug dump: prints the buffer read as T, sampling ~100 values when it is large and `stridable`.
+        print(header)
+        print("MTLBuffer: \(self) ")
+        let count = length / MemoryLayout<T>.stride  // `length` is in bytes; every index below is in elements
+        let pointer = contents().assumingMemoryBound(to: T.self)
+        var str = ""
+        if stridable && count > 1000 {
+            // Bound the walk by `count`, not `length`: indexing elements up to the byte length overruns the buffer.
+            for j in stride(from: 0, to: count, by: count / 100) { str += " \(pointer[j])" }
+        } else {
+            for i in 0..<count { str += " \(pointer[i])" }
+        }
+        print(str)
+        return nil  // always nil; T appears in the signature only so the caller can choose the element type
     }
-    print(str)
-    return nil
-  }
-  
-  func makeTexture(textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture {
-    let textureDesc = MTLTextureDescriptor.init()
-    textureDesc.width = textureWidth
-    textureDesc.height = textureHeight
-    textureDesc.depth = 1
-    textureDesc.usage = [.shaderRead, .shaderWrite]
-    textureDesc.pixelFormat = .rgba32Float
-    textureDesc.textureType = .type2DArray
-    textureDesc.storageMode = .shared
-    textureDesc.cpuCacheMode = .defaultCache
-    textureDesc.arrayLength = arrayLength
-    let texture = makeTexture(descriptor: textureDesc, offset: 0, bytesPerRow: textureWidth * 4 * 4)!
-    return texture
-  }
-  
-  func array<T>() -> [T] {
-    var array: [T] = []
-    let pointer = contents().bindMemory(to: T.self, capacity: length)
-    for i in 0..<(length / MemoryLayout<T>.size) {
-      array.append(pointer[i])
+    
+    func makeTexture(textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture { // builds an RGBA32Float 2D-array texture from this buffer
+        let textureDesc = MTLTextureDescriptor.init()
+        textureDesc.width = textureWidth
+        textureDesc.height = textureHeight
+        textureDesc.depth = 1
+        textureDesc.usage = [.shaderRead, .shaderWrite] // shaders may both read and write the texture
+        textureDesc.pixelFormat = .rgba32Float // four Float32 channels per texel
+        textureDesc.textureType = .type2DArray
+        textureDesc.storageMode = .shared
+        textureDesc.cpuCacheMode = .defaultCache
+        textureDesc.arrayLength = arrayLength // number of slices in the array
+        let texture = makeTexture(descriptor: textureDesc, offset: 0, bytesPerRow: textureWidth * 4 * 4)! // row bytes = width * 4 channels * 4 bytes; traps if creation returns nil
+        return texture
+    }
+    
+    func array<T>() -> [T] {
+        // Copies the buffer contents out as [T]; a trailing partial element is dropped.
+        // `length` is in bytes, so convert it to an element count using the stride (the
+        // spacing of consecutive T in memory) before binding and reading.
+        let count = length / MemoryLayout<T>.stride
+        let pointer = contents().bindMemory(to: T.self, capacity: count)
+        return Array(UnsafeBufferPointer(start: pointer, count: count))
     }
-    return array;
-  }
 }
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
index 724a44b0f4c9dcce189bc32abadb1675e01e8e72..52c27cceade8267aaeb5edee26db521419f1cf94 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
@@ -89,135 +89,135 @@ public class PaddleMobileUnitTest {
     }
     
     public func testConcat() {
-//        let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
-//        var it: [[Float32]] = []
-//        for _ in 0..<7 {
-//            it.append((0..<12).map { Float32($0) })
-//        }
-//        let input = it.map { device.tensor2texture(value: $0, dim: [3, 4]) }
-//        let output = device.tensor2texture(value: [Float32](), dim: [3, 28])
-//
-//        let param = ConcatTestParam.init(
-//            input: input,
-//            output: output,
-//            dims: [[3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]],
-//            axis: 1,
-//            odim: [3, 28]
-//        )
-//        let concatKernel = ConcatKernel<Float32>.init(device: device, testParam: param)
-//        concatKernel.test(cmdBuffer: buffer, param: param)
-//        buffer.addCompletedHandler { (buffer) in
-//            for i in 0..<it.count {
-//                let _: Float32? = input[i].logDesc()
-//                self.tensorPrint(tensor: it[i], dim: [3, 4])
-//            }
-//            let _: Float32? = output.logDesc()
-//            let tx: [Float32] = self.device.texture2tensor(texture: output, dim: [3, 28])
-//            self.tensorPrint(tensor: tx, dim: [3, 28])
-//        }
-//
-//        buffer.commit()
+        //        let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
+        //        var it: [[Float32]] = []
+        //        for _ in 0..<7 {
+        //            it.append((0..<12).map { Float32($0) })
+        //        }
+        //        let input = it.map { device.tensor2texture(value: $0, dim: [3, 4]) }
+        //        let output = device.tensor2texture(value: [Float32](), dim: [3, 28])
+        //
+        //        let param = ConcatTestParam.init(
+        //            input: input,
+        //            output: output,
+        //            dims: [[3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]],
+        //            axis: 1,
+        //            odim: [3, 28]
+        //        )
+        //        let concatKernel = ConcatKernel<Float32>.init(device: device, testParam: param)
+        //        concatKernel.test(cmdBuffer: buffer, param: param)
+        //        buffer.addCompletedHandler { (buffer) in
+        //            for i in 0..<it.count {
+        //                let _: Float32? = input[i].logDesc()
+        //                self.tensorPrint(tensor: it[i], dim: [3, 4])
+        //            }
+        //            let _: Float32? = output.logDesc()
+        //            let tx: [Float32] = self.device.texture2tensor(texture: output, dim: [3, 28])
+        //            self.tensorPrint(tensor: tx, dim: [3, 28])
+        //        }
+        //
+        //        buffer.commit()
     }
     
     public func testReshape() {
-//        let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
-//        let input: [Float32] = (0..<24).map { Float32($0) }
-//        let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
-//        let outTexture = device.tensor2texture(value: [Float32](), dim: [4, 6])
-//        let mp = ReshapeMetalParam.init(
-//            idim: (1, 2, 3, 4),
-//            itrans: (0, 1, 2, 3),
-//            odim: (1, 1, 4, 6),
-//            otrans: (0, 1, 2, 3)
-//        )
-//        let param = ReshapeTestParam.init(
-//            inputTexture: inTexture,
-//            outputTexture: outTexture,
-//            param: mp
-//        )
-//        let reshapeKernel = ReshapeKernel<Float32>.init(device: device, testParam: param)
-//        reshapeKernel.test(commandBuffer: buffer, testParam: param)
-//        buffer.addCompletedHandler { (buffer) in
-//            let _: Float32? = inTexture.logDesc()
-//            let _: Float32? = outTexture.logDesc()
-//            self.tensorPrint(tensor: input, dim: [2, 3, 4])
-//            let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [4, 6])
-//            self.tensorPrint(tensor: tx, dim: [4, 6])
-//        }
+        //        let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
+        //        let input: [Float32] = (0..<24).map { Float32($0) }
+        //        let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
+        //        let outTexture = device.tensor2texture(value: [Float32](), dim: [4, 6])
+        //        let mp = ReshapeMetalParam.init(
+        //            idim: (1, 2, 3, 4),
+        //            itrans: (0, 1, 2, 3),
+        //            odim: (1, 1, 4, 6),
+        //            otrans: (0, 1, 2, 3)
+        //        )
+        //        let param = ReshapeTestParam.init(
+        //            inputTexture: inTexture,
+        //            outputTexture: outTexture,
+        //            param: mp
+        //        )
+        //        let reshapeKernel = ReshapeKernel<Float32>.init(device: device, testParam: param)
+        //        reshapeKernel.test(commandBuffer: buffer, testParam: param)
+        //        buffer.addCompletedHandler { (buffer) in
+        //            let _: Float32? = inTexture.logDesc()
+        //            let _: Float32? = outTexture.logDesc()
+        //            self.tensorPrint(tensor: input, dim: [2, 3, 4])
+        //            let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [4, 6])
+        //            self.tensorPrint(tensor: tx, dim: [4, 6])
+        //        }
         
-//        let input: [Float32] = (0..<24).map { Float32($0) }
-//        let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
-//        let outTexture = device.tensor2texture(value: [Float32](), dim: [24])
-//        let mp = ReshapeMetalParam.init(
-//            idim: (1, 2, 3, 4),
-//            itrans: (0, 1, 2, 3),
-//            odim: (1, 1, 1, 24),
-//            otrans: (0, 1, 2, 3)
-//        )
-//        let param = ReshapeTestParam.init(
-//            inputTexture: inTexture,
-//            outputTexture: outTexture,
-//            param: mp
-//        )
-//        let reshapeKernel = ReshapeKernel<Float32>.init(device: device, testParam: param)
-//        reshapeKernel.test(commandBuffer: buffer, testParam: param)
-//        buffer.addCompletedHandler { (buffer) in
-//            let _: Float32? = inTexture.logDesc()
-//            let _: Float32? = outTexture.logDesc()
-//            self.tensorPrint(tensor: input, dim: [2, 3, 4])
-//            let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [24])
-//            self.tensorPrint(tensor: tx, dim: [24])
-//        }
-//
-//        
-//        buffer.commit()
+        //        let input: [Float32] = (0..<24).map { Float32($0) }
+        //        let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
+        //        let outTexture = device.tensor2texture(value: [Float32](), dim: [24])
+        //        let mp = ReshapeMetalParam.init(
+        //            idim: (1, 2, 3, 4),
+        //            itrans: (0, 1, 2, 3),
+        //            odim: (1, 1, 1, 24),
+        //            otrans: (0, 1, 2, 3)
+        //        )
+        //        let param = ReshapeTestParam.init(
+        //            inputTexture: inTexture,
+        //            outputTexture: outTexture,
+        //            param: mp
+        //        )
+        //        let reshapeKernel = ReshapeKernel<Float32>.init(device: device, testParam: param)
+        //        reshapeKernel.test(commandBuffer: buffer, testParam: param)
+        //        buffer.addCompletedHandler { (buffer) in
+        //            let _: Float32? = inTexture.logDesc()
+        //            let _: Float32? = outTexture.logDesc()
+        //            self.tensorPrint(tensor: input, dim: [2, 3, 4])
+        //            let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [24])
+        //            self.tensorPrint(tensor: tx, dim: [24])
+        //        }
+        //
+        //        
+        //        buffer.commit()
     }
     
     public func testTranspose() {
-
+        
         let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
-//        var input: [Float32] = []
-//        for i in 0..<72 {
-//            input.append(Float32(i))
-//        }
-////        let inputTexture = device.makeFloatTexture(value: input, textureWidth: 3, textureHeight: 2, arrayLength: 3)
-//        let inputTexture = device.tensor2texture(value: input, dim: [4, 3, 2, 3]);
-//        // group 1
-//        let outputTexture = device.tensor2texture(value: [Float32](), dim: [3, 3, 2, 4])
-//        let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 3, oC: 4, axis: [3, 1, 2, 0])
-////        let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [3, 0, 2, 1])
-////        // group 2
-////        let outputTexture = device.makeFloatTexture(value: [Float32](), textureWidth: 3, textureHeight: 3, arrayLength: 6)
-////        let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 4, axis: [3, 0, 2, 1])
-////
-//        let transposeKernel = TransposeKernel<Float32>.init(device: device, testParam: param)
-//
-//        transposeKernel.test(commandBuffer: buffer, param: param)
-//
-//        buffer.addCompletedHandler { (buffer) in
-//            let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false)
-//            let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false)
-//            self.tensorPrint(tensor: input, dim: [4, 3, 2, 3])
-//            let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 3, 2, 4])
-//            self.tensorPrint(tensor: tx, dim: [3, 3, 2, 4])
-//        }
-//
-//        let input: [Float32] = (0..<24).map { Float32($0) }
-//        let inputTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
-//        let outputTexture = device.tensor2texture(value: [Float](), dim: [3, 4, 2])
-//        let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [0, 2, 3, 1])
-//        let transposeKernel = TransposeKernel<Float32>.init(device: device, testParam: param)
-//
-//        transposeKernel.test(commandBuffer: buffer, param: param)
-//
-//        buffer.addCompletedHandler { (buffer) in
-//            let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false)
-//            let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false)
-//            self.tensorPrint(tensor: input, dim: [2, 3, 4])
-//            let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 4, 2])
-//            self.tensorPrint(tensor: tx, dim: [3, 4, 2])
-//        }
-//        
+        //        var input: [Float32] = []
+        //        for i in 0..<72 {
+        //            input.append(Float32(i))
+        //        }
+        ////        let inputTexture = device.makeFloatTexture(value: input, textureWidth: 3, textureHeight: 2, arrayLength: 3)
+        //        let inputTexture = device.tensor2texture(value: input, dim: [4, 3, 2, 3]);
+        //        // group 1
+        //        let outputTexture = device.tensor2texture(value: [Float32](), dim: [3, 3, 2, 4])
+        //        let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 3, oC: 4, axis: [3, 1, 2, 0])
+        ////        let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [3, 0, 2, 1])
+        ////        // group 2
+        ////        let outputTexture = device.makeFloatTexture(value: [Float32](), textureWidth: 3, textureHeight: 3, arrayLength: 6)
+        ////        let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 4, axis: [3, 0, 2, 1])
+        ////
+        //        let transposeKernel = TransposeKernel<Float32>.init(device: device, testParam: param)
+        //
+        //        transposeKernel.test(commandBuffer: buffer, param: param)
+        //
+        //        buffer.addCompletedHandler { (buffer) in
+        //            let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false)
+        //            let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false)
+        //            self.tensorPrint(tensor: input, dim: [4, 3, 2, 3])
+        //            let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 3, 2, 4])
+        //            self.tensorPrint(tensor: tx, dim: [3, 3, 2, 4])
+        //        }
+        //
+        //        let input: [Float32] = (0..<24).map { Float32($0) }
+        //        let inputTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
+        //        let outputTexture = device.tensor2texture(value: [Float](), dim: [3, 4, 2])
+        //        let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [0, 2, 3, 1])
+        //        let transposeKernel = TransposeKernel<Float32>.init(device: device, testParam: param)
+        //
+        //        transposeKernel.test(commandBuffer: buffer, param: param)
+        //
+        //        buffer.addCompletedHandler { (buffer) in
+        //            let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false)
+        //            let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false)
+        //            self.tensorPrint(tensor: input, dim: [2, 3, 4])
+        //            let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 4, 2])
+        //            self.tensorPrint(tensor: tx, dim: [3, 4, 2])
+        //        }
+        //        
         buffer.commit()
     }
     
@@ -225,72 +225,72 @@ public class PaddleMobileUnitTest {
         let buffer = queue.makeCommandBuffer() ?! " buffer is nil "
         
         let input: [Float32] = [
-         1.0, 2.0, 3.0, 4.0,
-         1.0, 2.0, 3.0, 4.0,
-         1.0, 2.0, 3.0, 4.0,
-         
-         1.0, 2.0, 3.0, 4.0,
-         1.0, 2.0, 3.0, 4.0,
-         1.0, 2.0, 3.0, 4.0,
-         
-         1.0, 2.0, 3.0, 4.0,
-         1.0, 2.0, 3.0, 4.0,
-         1.0, 2.0, 3.0, 4.0,
-        ]
+            1.0, 2.0, 3.0, 4.0,
+            1.0, 2.0, 3.0, 4.0,
+            1.0, 2.0, 3.0, 4.0,
+            
+            1.0, 2.0, 3.0, 4.0,
+            1.0, 2.0, 3.0, 4.0,
+            1.0, 2.0, 3.0, 4.0,
+            
+            1.0, 2.0, 3.0, 4.0,
+            1.0, 2.0, 3.0, 4.0,
+            1.0, 2.0, 3.0, 4.0,
+            ]
         
         let filter: [Float32] = [
-        //1.0
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        //2.0
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        //3.0
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        //4.0
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        1.0, 1.0, 1.0, 1.0,
-        ]
+            //1.0
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            //2.0
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            //3.0
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            //4.0
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            1.0, 1.0, 1.0, 1.0,
+            ]
         
         let biase: [Float32] = [1.0, 1.0, 1.0, 100.0]
         let newScalue: [Float32] = [1.0, 1.0, 1.0, 1.0]
@@ -324,10 +324,10 @@ public class PaddleMobileUnitTest {
         
         let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize)
         
-      let initContext = InitContext.init()
-      initContext.metalLoadMode = .LoadMetalInDefaultLib
+        let initContext = InitContext.init()
+        initContext.metalLoadMode = .LoadMetalInDefaultLib
         
-      let convAddBnReluKernel = ConvAddBatchNormReluKernel<Float32>.init(device: device, testParam: param, initContext: initContext)
+        let convAddBnReluKernel = ConvAddBatchNormReluKernel<Float32>.init(device: device, testParam: param, initContext: initContext)
         
         convAddBnReluKernel.test(commandBuffer: buffer, param: param)
         
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/Types.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/Types.swift
index ae7b898a8ecedefc21f0dce36a845fb024786246..701bb37bf2442ff30f372d420670ca6f4f7fc85e 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Common/Types.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Common/Types.swift
@@ -16,222 +16,222 @@ import Foundation
 import Accelerate
 
 public protocol SummableMultipliable: Equatable {
-  static func +(lhs: Self, rhs: Self) -> Self
-  static func *(lhs: Self, rhs: Self) -> Self
-  static func -(lhs: Self, rhs: Self) -> Self
+    static func +(lhs: Self, rhs: Self) -> Self
+    static func *(lhs: Self, rhs: Self) -> Self
+    static func -(lhs: Self, rhs: Self) -> Self
 }
 public protocol PrecisionType: SummableMultipliable{
-  init(inFloat: Float32)
-  init(inFloat16: Float16)
-  init<P: PrecisionType>(_ inP: P)
-  static var bitSize: UInt { get }
+    init(inFloat: Float32)
+    init(inFloat16: Float16)
+    init<P: PrecisionType>(_ inP: P)
+    static var bitSize: UInt { get }
 }
 
 public typealias Float16 = Int16
 extension Float16: PrecisionType {
-  public static func * (prefix: Float16, postfix: Float16) {
-    return prefix * postfix
-  }
-  
-  public init<P>(_ inP: P) where P : PrecisionType {
-    if P.bitSize == Float32.bitSize {
-      self = Float16(inFloat: inP as! Float32)
-    } else if P.bitSize == Float16.bitSize {
-      self = inP as! Float16
-    } else {
-      fatalError()
+    public static func * (prefix: Float16, postfix: Float16) {
+        return prefix * postfix
+    }
+    
+    public init<P>(_ inP: P) where P : PrecisionType {
+        if P.bitSize == Float32.bitSize {
+            self = Float16(inFloat: inP as! Float32)
+        } else if P.bitSize == Float16.bitSize {
+            self = inP as! Float16
+        } else {
+            fatalError()
+        }
+    }
+    
+    public static var bitSize: UInt {
+        return 16
+    }
+    
+    public init(inFloat16: Float16) {
+        self = inFloat16
+    }
+    public init(inFloat: Float32) {
+        self = Int16(inFloat)
     }
-  }
-  
-  public static var bitSize: UInt {
-    return 16
-  }
-  
-  public init(inFloat16: Float16) {
-    self = inFloat16
-  }
-  public init(inFloat: Float32) {
-    self = Int16(inFloat)
-  }
 }
 
 extension Float32: PrecisionType {
-  public init<P>(_ inP: P) where P : PrecisionType {
-    if P.bitSize == Float32.bitSize {
-      self = inP as! Float32
-    } else if P.bitSize == Float16.bitSize {
-      self = Float32.init(inP as! Float16)
-    } else {
-      fatalError()
+    public init<P>(_ inP: P) where P : PrecisionType {
+        if P.bitSize == Float32.bitSize {
+            self = inP as! Float32
+        } else if P.bitSize == Float16.bitSize {
+            self = Float32.init(inP as! Float16)
+        } else {
+            fatalError()
+        }
+    }
+    
+    public init(inFloat: Float32) {
+        self = inFloat
+    }
+    
+    public init(inFloat16: Float16) {
+        self = Float32.init(inFloat16)
+    }
+    
+    public static var bitSize: UInt {
+        return 32
     }
-  }
-  
-  public init(inFloat: Float32) {
-    self = inFloat
-  }
-  
-  public init(inFloat16: Float16) {
-    self = Float32.init(inFloat16)
-  }
-  
-  public static var bitSize: UInt {
-    return 32
-  }
 }
 
 public func float32ToFloat16(input: UnsafeMutablePointer<Float32>, output: UnsafeMutableRawPointer, count: Int) {
-  var float32Buffer = vImage_Buffer(data: input,  height: 1, width: UInt(count), rowBytes: count * 4)
-  var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2)
-  guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else {
-    fatalError(" float 32 to float 16 error ! ")
-  }
+    var float32Buffer = vImage_Buffer(data: input,  height: 1, width: UInt(count), rowBytes: count * 4)
+    var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2)
+    guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else {
+        fatalError(" float 32 to float 16 error ! ")
+    }
 }
 
 public func float16To32(input: UnsafeMutablePointer<Float16>, count: Int) -> [Float32] {
-  var output = Array<Float>.init(repeating: 0.0, count: count)
-  float16to32(input: input, output: &output, count: count)
-  return output
+    var output = Array<Float>.init(repeating: 0.0, count: count)
+    float16to32(input: input, output: &output, count: count)
+    return output
 }
 
 public func float16to32(input: UnsafeMutablePointer<Float16>, output: UnsafeMutablePointer<Float32>, count: Int) {
-  var bufferFloat16 = vImage_Buffer(data: input,  height: 1, width: UInt(count), rowBytes: count * 2)
-  var bufferFloat32 = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 4)
-  if vImageConvert_Planar16FtoPlanarF(&bufferFloat16, &bufferFloat32, 0) != kvImageNoError {
-    fatalError(" convert float16 to float32 error")
-  }
+    var bufferFloat16 = vImage_Buffer(data: input,  height: 1, width: UInt(count), rowBytes: count * 2)
+    var bufferFloat32 = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 4)
+    if vImageConvert_Planar16FtoPlanarF(&bufferFloat16, &bufferFloat32, 0) != kvImageNoError {
+        fatalError(" convert float16 to float32 error")
+    }
 }
 
 // N - 0   C - 1   H - 2   W - 3
 struct DataLayout {
-  
-  static func NCHW(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout {
-    return DataLayout.init([(.N, dim[0]), (.C, dim[1]), (.H, dim[2]), (.W, dim[3])])
-  }
-  
-  static func NHWC(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout {
-    return DataLayout.init([(.N, dim[0]), (.H, dim[1]), (.W, dim[2]), (.C, dim[3])])
-  }
-  
-  func count() -> Int {
-    return layoutWithDim.count
-  }
-  
-  var N: Int? {
-    get {
-      for layoutDim in layoutWithDim {
-        if layoutDim.0 == .N {
-          return layoutDim.1
-        }
-      }
-      return nil
+    
+    static func NCHW(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout {
+        return DataLayout.init([(.N, dim[0]), (.C, dim[1]), (.H, dim[2]), (.W, dim[3])])
     }
-    set {
-      var newN = (Layout.N, newValue)
-      if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
-        return layout == .N
-      }) {
-        fatalError()
-      }
+    
+    static func NHWC(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout {
+        return DataLayout.init([(.N, dim[0]), (.H, dim[1]), (.W, dim[2]), (.C, dim[3])])
     }
-  }
-  var C: Int? {
-    get {
-      for layoutDim in layoutWithDim {
-        if layoutDim.0 == .C {
-          return layoutDim.1
-        }
-      }
-      return nil
+    
+    func count() -> Int {
+        return layoutWithDim.count
     }
-    set {
-      var newN = (Layout.C, newValue)
-      if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
-        return layout == .N
-      }) {
-        fatalError()
-      }
+    
+    var N: Int? {
+        get {
+            for layoutDim in layoutWithDim {
+                if layoutDim.0 == .N {
+                    return layoutDim.1
+                }
+            }
+            return nil
+        }
+        set {
+            var newN = (Layout.N, newValue)
+            if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
+                return layout == .N
+            }) {
+                fatalError()
+            }
+        }
     }
-  }
-  var H: Int? {
-    get {
-      for layoutDim in layoutWithDim {
-        if layoutDim.0 == .H {
-          return layoutDim.1
+    var C: Int? {
+        get {
+            for layoutDim in layoutWithDim {
+                if layoutDim.0 == .C {
+                    return layoutDim.1
+                }
+            }
+            return nil
+        }
+        set {
+            // Unimplemented stub: like the N, H and W setters, it traps when the layout already
+            // has an entry for its own axis, so the search must match .C (not .N).
+            // `newValue` is intentionally unused until real assignment is implemented.
+            if layoutWithDim.contains(where: { $0.0 == .C }) {
+                fatalError()
+            }
         }
-      }
-      return nil
     }
-    set {
-      var newN = (Layout.H, newValue)
-      if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
-        return layout == .H
-      }) {
-        fatalError()
-      }
+    var H: Int? {
+        get {
+            for layoutDim in layoutWithDim {
+                if layoutDim.0 == .H {
+                    return layoutDim.1
+                }
+            }
+            return nil
+        }
+        set {
+            var newN = (Layout.H, newValue)
+            if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
+                return layout == .H
+            }) {
+                fatalError()
+            }
+        }
     }
-  }
-  var W: Int? {
-    get {
-      for layoutDim in layoutWithDim {
-        if layoutDim.0 == .W {
-          return layoutDim.1
+    var W: Int? {
+        get {
+            for layoutDim in layoutWithDim {
+                if layoutDim.0 == .W {
+                    return layoutDim.1
+                }
+            }
+            return nil
+        }
+        set {
+            var newN = (Layout.W, newValue)
+            if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
+                return layout == .W
+            }) {
+                fatalError()
+            }
         }
-      }
-      return nil
     }
-    set {
-      var newN = (Layout.W, newValue)
-      if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
-        return layout == .W
-      }) {
-        fatalError()
-      }
+    
+    
+    init(_ inLayout: [(Layout, Int)]) {
+        layoutWithDim = inLayout
     }
-  }
-  
-  
-  init(_ inLayout: [(Layout, Int)]) {
-    layoutWithDim = inLayout
-  }
-  
-  func layout() -> [Layout] {
-    return layoutWithDim.map({ (layout: Layout, dim: Int) -> Layout in
-      return layout
-    })
-  }
-  
-  var layoutWithDim: [(Layout, Int)] = [(.N, 0), (.C, 0), (.H, 0), (.W, 0)]
-  
-  func convertTo(inLayout: [Layout]) {
     
-  }
-  
-  enum Layout: Int{
-    case N = 0
-    case C = 1
-    case H = 2
-    case W = 3
-    static func defaultLayout() -> [Layout] {
-      return [N, C, H, W]
+    func layout() -> [Layout] {
+        return layoutWithDim.map({ (layout: Layout, dim: Int) -> Layout in
+            return layout
+        })
+    }
+    
+    var layoutWithDim: [(Layout, Int)] = [(.N, 0), (.C, 0), (.H, 0), (.W, 0)]
+    
+    func convertTo(inLayout: [Layout]) {
+        
+    }
+    
+    enum Layout: Int{
+        case N = 0
+        case C = 1
+        case H = 2
+        case W = 3
+        static func defaultLayout() -> [Layout] {
+            return [N, C, H, W]
+        }
     }
-  }
 }
 
 extension DataLayout: Equatable {
-  public static func == (lhs: DataLayout, rhs: DataLayout) -> Bool {
-    if lhs.layoutWithDim.count == rhs.layoutWithDim.count {
-      var result = true
-      for i in 0..<lhs.layoutWithDim.count {
-        result = (lhs.layoutWithDim[i].0 == rhs.layoutWithDim[i].0)
-        if !result {
-          break
+    public static func == (lhs: DataLayout, rhs: DataLayout) -> Bool {
+        if lhs.layoutWithDim.count == rhs.layoutWithDim.count {
+            var result = true
+            for i in 0..<lhs.layoutWithDim.count {
+                result = (lhs.layoutWithDim[i].0 == rhs.layoutWithDim[i].0)
+                if !result {
+                    break
+                }
+            }
+            return result
+        } else {
+            return false
         }
-      }
-      return result
-    } else {
-      return false
     }
-  }
 }
 
 public protocol Variant: CustomStringConvertible, CustomDebugStringConvertible {
@@ -253,42 +253,42 @@ extension MTLTexture where Self: Variant {
 }
 
 public class FetchHolder: Variant {
-  var resultBuffer: MTLBuffer?
-  public var dim: Dim
-  public var capacity: Int
-  public var paddedCapacity: Int
-  
-  init(inPaddedCapacity: Int, inDim: Dim) {
-    paddedCapacity = inPaddedCapacity
-    capacity = inDim.numel()
-    dim = inDim
-  }
-  
-  public func initBuffer(device: MTLDevice) {
-    resultBuffer = device.makeBuffer(length: paddedCapacity * 4, options: [])
-  }
-  
-  var result: UnsafeMutablePointer<Float32> {
-    guard let inResultBuffer = resultBuffer else {
-      fatalError()
+    var resultBuffer: MTLBuffer?
+    public var dim: Dim
+    public var capacity: Int
+    public var paddedCapacity: Int
+    /// Stores the sizes only; `capacity` is `inDim.numel()`. No buffer is created here.
+    init(inPaddedCapacity: Int, inDim: Dim) {
+        paddedCapacity = inPaddedCapacity
+        capacity = inDim.numel()
+        dim = inDim
     }
-    return inResultBuffer.contents().bindMemory(to: Float32.self, capacity: paddedCapacity)
-  }
-  
+    /// Creates `resultBuffer` on `device` with `paddedCapacity * 4` bytes (4 is presumably the Float32 size — confirm).
+    public func initBuffer(device: MTLDevice) {
+        resultBuffer = device.makeBuffer(length: paddedCapacity * 4, options: [])
+    }
+    /// Contents of `resultBuffer` viewed as `paddedCapacity` Float32 values; traps while `resultBuffer` is nil.
+    var result: UnsafeMutablePointer<Float32> {
+        guard let inResultBuffer = resultBuffer else {
+            fatalError()
+        }
+        return inResultBuffer.contents().bindMemory(to: Float32.self, capacity: paddedCapacity)
+    }
+    
 }
 
 extension FetchHolder: CustomStringConvertible, CustomDebugStringConvertible {
-  public var description: String {
-    fatalError()
-//    return "\(result)"
-  }
-  
-  public var debugDescription: String {
-    fatalError()
-//    return "\(result)"
-  }
-  
-  
+    public var description: String {
+        fatalError()
+        // Not implemented: reading this property traps. Disabled body: return "\(result)"
+    }
+    /// Not implemented either: reading this property traps via `fatalError()`.
+    public var debugDescription: String {
+        fatalError()
+        // Disabled body: return "\(result)"
+    }
+    
+    
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Dim.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Dim.swift
index 1817184bf7d5ef7ca9cbe6e9fd829aa14b564dc6..77b67bf16ca248c2e3d9bac525c5ee8d64d67255 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Dim.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Dim.swift
@@ -15,41 +15,41 @@
 import Foundation
 
 @objc public class Dim: NSObject {
-  private(set) var dims: [Int]
-  
-  @objc public init(inDim: [Int]) {
-    dims = inDim
-  }
-  
-  public func cout() -> Int {
-    return dims.count
-  }
-  
-  public func numel() -> Int {
-    return dims.reduce(1) { $0 * $1 }
-  }
-  
-  public static func ==(left: Dim, right: Dim) -> Bool {
-    return left.dims == right.dims;
-  }
-  
-  public static func !=(left: Dim, right: Dim) -> Bool {
-    return left.dims != right.dims;
-  }
-  
-  public subscript(index: Int) -> Int {
-    return dims[index];
-  }
-  
-  public override var description: String {
-    return "\(dims)"
-  }
-  
-  func swapeDimAt(index1: Int, index2: Int) {
-    dims.swapAt(index1, index2)
-  }
-  
-  private override init(){
-    fatalError()
-  }
+    private(set) var dims: [Int]
+    /// Wraps `inDim` as the list of dimension sizes.
+    @objc public init(inDim: [Int]) {
+        dims = inDim
+    }
+    /// Number of dimensions (`dims.count`).
+    public func cout() -> Int {
+        return dims.count
+    }
+    /// Product of all dimension sizes; 1 when `dims` is empty.
+    public func numel() -> Int {
+        return dims.reduce(1) { $0 * $1 }
+    }
+    /// True when both `dims` arrays hold the same sizes in the same order.
+    public static func ==(left: Dim, right: Dim) -> Bool {
+        return left.dims == right.dims;
+    }
+    /// True when the two `dims` arrays differ.
+    public static func !=(left: Dim, right: Dim) -> Bool {
+        return left.dims != right.dims;
+    }
+    /// Size stored at position `index`; an out-of-range index traps like any array subscript.
+    public subscript(index: Int) -> Int {
+        return dims[index];
+    }
+    /// The `dims` array rendered through string interpolation.
+    public override var description: String {
+        return "\(dims)"
+    }
+    /// Swaps, in place, the sizes stored at `index1` and `index2`.
+    func swapeDimAt(index1: Int, index2: Int) {
+        dims.swapAt(index1, index2)
+    }
+    /// The parameterless initializer is private and traps if it is ever reached.
+    private override init(){
+        fatalError()
+    }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift
index 8f02bf17bcf0eed9e6b7a7bcb524cf6d18aa49fa..9f257200b12edda19085b5414b1cafcc30ea6153 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift
@@ -14,136 +14,141 @@
 
 import Foundation
 
-
 let testTo = 5
 
 var isTest = false
 
 @objc public class GPUResultHolder: NSObject{
-  @objc public let dim: [Int]
-  @objc public let capacity: Int
-  @objc public var resultPointer: UnsafeMutablePointer<Float32>?
-  @objc public var intermediateResults: [String : [MTLBuffer]]?
-  public init(inDim: [Int], inPointer: UnsafeMutablePointer<Float32>?, inCapacity: Int, inIntermediateResults: [String : [MTLBuffer]]? = nil) {
-    dim = inDim
-    capacity = inCapacity
+    @objc public let dim: [Int]
+    @objc public let capacity: Int
+    @objc public var resultPointer: UnsafeMutablePointer<Float32>?
+    @objc public var intermediateResults: [String : [MTLBuffer]]?
+    public init(inDim: [Int], inPointer: UnsafeMutablePointer<Float32>?, inCapacity: Int, inIntermediateResults: [String : [MTLBuffer]]? = nil) {
+        dim = inDim
+        capacity = inCapacity
+        // When a source pointer is given, copy `inCapacity` floats from it into a newly allocated buffer.
+        if let inInPointer = inPointer {
+            resultPointer = UnsafeMutablePointer<Float32>.allocate(capacity: inCapacity)
+            resultPointer?.initialize(from: inInPointer, count: inCapacity)
+        }
+        // NOTE(review): no deinit is visible for this class, so the allocation above appears never to be freed — confirm ownership.
+        intermediateResults = inIntermediateResults
+    }
     
-    if let inInPointer = inPointer {
-      resultPointer = UnsafeMutablePointer<Float32>.allocate(capacity: inCapacity)
-      resultPointer?.initialize(from: inInPointer, count: inCapacity)
+    public override var description: String { // Not implemented: reading this property traps.
+        fatalError()
     }
     
-    intermediateResults = inIntermediateResults
-  }
-  
-  public override var description: String {
-    fatalError()
-  }
-  
 }
 
 public class Executor<P: PrecisionType> {
-  var ops: [Runable & InferShaperable] = []
-  var preInputDim: Dim = Dim.init(inDim: [])
-  let program: Program
-  let device: MTLDevice
-  let inflightSemaphore: DispatchSemaphore
-  let queue: MTLCommandQueue
-  init(inDevice:MTLDevice, inQueue: MTLCommandQueue, inProgram: Program, initContext: InitContext) throws {
-    self.inflightSemaphore = DispatchSemaphore(value: 1)
-    program = inProgram
-    device = inDevice
-    queue = inQueue
-    
-    for block in inProgram.programDesc.blocks {
-      //block.ops.count
-      for i in 0..<block.ops.count {
-        let opDesc = block.ops[i]
-        do {
-          let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: opDesc, scope: inProgram.scope, initContext: initContext)
-          ops.append(op)
-        } catch let error {
-          throw error
+    var ops: [Runable & InferShaperable] = []
+    var preInputDim: Dim = Dim.init(inDim: [])
+    let program: Program
+    let device: MTLDevice
+    let inflightSemaphore: DispatchSemaphore
+    let queue: MTLCommandQueue
+    /// Creates one op for every descriptor of every block in `inProgram`, in program order.
+    /// - Throws: Rethrows the first error raised by `OpCreator.creat`.
+    init(inDevice:MTLDevice, inQueue: MTLCommandQueue, inProgram: Program, initContext: InitContext) throws {
+        // One permit in the semaphore that `predict` waits on.
+        inflightSemaphore = DispatchSemaphore(value: 1)
+        program = inProgram
+        device = inDevice
+        queue = inQueue
+        
+        let scope = inProgram.scope
+        let creator = OpCreator<P>.shared
+        for block in inProgram.programDesc.blocks {
+            // An error from `creat` propagates to the caller unchanged.
+            for opDesc in block.ops {
+                let op = try creator.creat(device: inDevice, opDesc: opDesc, scope: scope, initContext: initContext)
+                ops.append(op)
+            }
         }
-      }
-    }
-  }
-  
-  public func predict(input: MTLTexture, dim: Dim, completionHandle: @escaping ([GPUResultHolder]) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws {
-    inflightSemaphore.wait()
-
-    guard let buffer = queue.makeCommandBuffer() else {
-      throw PaddleMobileError.predictError(message: "CommandBuffer is nil")
-    }
-    
-    let resInput: MTLTexture
-    if let inPre = preProcessKernle {
-      do {
-        try inPre.compute(inputTexuture: input, commandBuffer: buffer)
-        resInput = inPre.outputTexture
-      } catch let error {
-        throw error
-      }
-    } else {
-      resInput = input
     }
     
-    let inputTexture = InputTexture.init(inMTLTexture: resInput, inExpectDim: dim)
-    program.scope.setInput(input: inputTexture)
-    //(ops.count - except)
-    for i in 0..<(ops.count - except) {
-      let op = ops[i]
-      do {
-        try op.run(device: device, buffer: buffer)
-      } catch let error {
-        throw error
-      }
-    }
-    
-    var outputTextures: [String : [MTLBuffer]]?
-    if except > 0 {
-      ops[ops.count - except].computeMiddleResult(device: device, buffer: buffer)
-      outputTextures = ops[ops.count - except].inputVariant()
+    /// Encodes the network for `input` into one command buffer and commits it; `completionHandle` runs when the GPU is done.
+    /// - Throws: `PaddleMobileError.predictError` for a bad `except` or a nil command buffer; kernel/op errors are rethrown.
+    public func predict(input: MTLTexture, dim: Dim, completionHandle: @escaping ([GPUResultHolder]) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws {
+        // `except` counts trailing ops to skip; anything outside 0...ops.count would index past `ops` below.
+        guard except >= 0, except <= ops.count else {
+            throw PaddleMobileError.predictError(message: "except is out of range")
+        }
+        
+        inflightSemaphore.wait()
+        // Only the completion handler signals the semaphore, so a throw before commit() must release it here.
+        var committed = false
+        defer { if !committed { inflightSemaphore.signal() } }
+        
+        guard let buffer = queue.makeCommandBuffer() else {
+            throw PaddleMobileError.predictError(message: "CommandBuffer is nil")
+        }
+        
+        // Optional pre-processing: its output texture replaces `input` as the network input.
+        let resInput: MTLTexture
+        if let inPre = preProcessKernle {
+            try inPre.compute(inputTexuture: input, commandBuffer: buffer)
+            resInput = inPre.outputTexture
+        } else {
+            resInput = input
+        }
+        
+        let inputTexture = InputTexture.init(inMTLTexture: resInput, inExpectDim: dim)
+        program.scope.setInput(input: inputTexture)
+        for i in 0..<(ops.count - except) {
+            try ops[i].run(device: device, buffer: buffer)
+        }
+        
+        // With ops skipped, return `inputVariant()` of the first skipped op instead of the fetch result.
+        var outputTextures: [String : [MTLBuffer]]?
+        if except > 0 {
+            ops[ops.count - except].computeMiddleResult(device: device, buffer: buffer)
+            outputTextures = ops[ops.count - except].inputVariant()
+        }
+        
+        buffer.addCompletedHandler { [weak self] (commandbuffer) in
+            guard let SSelf = self else {
+                fatalError()
+            }
+            
+            // Debug aid (disabled): write the network input to a file.
+            /*
+             let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
+             print(dim)
+             writeToLibrary(fileName: "yolo_input", array: inputArr)
+             print(" write done ")
+             return
+             */
+            
+            // Print each op's output when debugging is enabled.
+            if GlobalConfig.shared.debug {
+                for i in 0..<SSelf.ops.count {
+                    print("第 \(i) 个 op: " )
+                    let op = SSelf.ops[i]
+                    op.delogOutput()
+                }
+            }
+            
+            var resultHolder: GPUResultHolder
+            if except > 0 {
+                resultHolder = GPUResultHolder.init(inDim: [], inPointer: nil, inCapacity: 0,  inIntermediateResults: outputTextures)
+            } else {
+                let outputVar: Variant = SSelf.program.scope.output()!
+                let output: FetchHolder = outputVar as! FetchHolder
+                resultHolder = GPUResultHolder.init(inDim: output.dim.dims, inPointer: output.result, inCapacity: output.capacity)
+            }
+            
+            completionHandle([resultHolder])
+            SSelf.inflightSemaphore.signal()
+        }
+        
+        buffer.commit()
+        committed = true
+    }
     
-    buffer.addCompletedHandler { [weak self] (commandbuffer) in
-      guard let SSelf = self else {
-        fatalError()
-      }
-            
-      //将输入写进文件
-      /*
-       let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
-       print(dim)
-       writeToLibrary(fileName: "test_image_super", array: inputArr)
-       print(" write done ")
-       return
-       */
-      
-      /*    输出 op 计算结果
-       for op in SSelf.ops {
-       op.delogOutput()
-       }
-       */
-      
-      var resultHolder: GPUResultHolder
-      if except > 0 {
-        resultHolder = GPUResultHolder.init(inDim: [], inPointer: nil, inCapacity: 0,  inIntermediateResults: outputTextures)
-      } else {
-        let outputVar: Variant = SSelf.program.scope.output()!
-        let output: FetchHolder = outputVar as! FetchHolder
-        resultHolder = GPUResultHolder.init(inDim: output.dim.dims, inPointer: output.result, inCapacity: output.capacity)
-      }
-      
-      completionHandle([resultHolder])
-      SSelf.inflightSemaphore.signal()
+    public func clear() { // Forwards to `program.scope.clear()`.
+        program.scope.clear()
     }
     
-    buffer.commit()
-  }
-  
-  public func clear() {
-    program.scope.clear()
-  }
-  
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift
index 1d4f0ec14fa6442be708e729ce841969a12f5582..790b961480982157da2ba4737442285c3676a2ef 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift
@@ -16,251 +16,251 @@ import Foundation
 //import SwiftProtobuf
 
 public class Loader<P: PrecisionType> {
-  class ParaLoader {
-    let file: UnsafeMutablePointer<FILE>
-    let fileSize: Int
-    var nowIndex: Int
-    init(paramPath: String) throws {
-      guard let tmpFile = fopen(paramPath, "rb") else {
-        throw PaddleMobileError.loaderError(message: "open param file error" + paramPath)
-      }
-      file = tmpFile
-      fseek(file, 0, SEEK_END)
-      fileSize = ftell(file)
-      guard fileSize > 0 else {
-        throw PaddleMobileError.loaderError(message: "param file size is too small")
-      }
-      rewind(file)
-      nowIndex = 0
-    }
-    
-    func read(tensor: Tensor<P>) throws {
-      guard nowIndex <= fileSize else {
-        throw PaddleMobileError.loaderError(message: "out of the file range")
-      }
-      
-      func pointerReader<T>(type: T.Type) -> T {
-        let ptr = UnsafeMutablePointer<T>.allocate(capacity: MemoryLayout<T>.size)
-        fread(ptr, 1, MemoryLayout<T>.size, file)
-        nowIndex += MemoryLayout<T>.size
-        let pointee = ptr.pointee
-        ptr.deinitialize(count: MemoryLayout<UInt32>.size)
-        ptr.deallocate()
-        return pointee
-      }
-      
-      let _ = pointerReader(type: UInt32.self)
-      let lodLevel = pointerReader(type: UInt64.self)
-      for _ in 0..<lodLevel {
-        let size = pointerReader(type: UInt64.self)
-        for _ in 0..<Int(size/UInt64(MemoryLayout<size_t>.size)){
-          _ = pointerReader(type: size_t.self)
-        }
-      }
-      
-      let _ = pointerReader(type: UInt32.self)
-      
-      let tensorDescSize = pointerReader(type: Int32.self)
-      
-      fseek(file, Int(tensorDescSize), SEEK_CUR)
-      nowIndex += Int(tensorDescSize)
-      
-      /*
-       这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度
-       */
-      
-      //现在模型传入模型为  Float 类型, 这块应该根据模型来
-      //            let tmpCapacity = MemoryLayout<Float>.size * tensor.numel()
-      //            let tmpPointer = UnsafeMutablePointer<Float>.allocate(capacity: tmpCapacity);
-      let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file)
-      
-      guard bytesRead == tensor.data.size else {
-        throw PaddleMobileError.loaderError(message: "param read size error")
-      }
-      
-      // TODO: use script to convert
-      //            let bytesRead = fread(tmpPointer, 1, tmpCapacity, file)
-      //            for i in 0..<tensor.numel() {
-      //                tensor.data[i] = P.init(inFloat: tmpPointer[i])
-      //            }
-      //            tmpPointer.deinitialize(count: tmpCapacity)
-      //            tmpPointer.deallocate()
-      
-      nowIndex += bytesRead
-    }
-    
-    deinit {
-      fclose(file)
-    }
-  }
-  class ParaLoaderWithPointer {
-    var paramPointer: UnsafeMutableRawPointer
-      let paramSize: Int
-      var nowIndex: Int
-      init(pPointer: UnsafeMutableRawPointer,pSize:Int) throws {
-          paramPointer = UnsafeMutableRawPointer.init(pPointer)
-          paramSize = pSize
-          nowIndex = 0
-      }
-    
-      func read(tensor: Tensor<P>) throws {
-        guard nowIndex <= paramSize else {
-          throw PaddleMobileError.loaderError(message: "out of the file range")
-        }
-        var readerIndex: Int = 0
-        func pointerReader<T>(type: T.Type) -> T {
-          let ptr = UnsafeMutablePointer<T>.allocate(capacity: MemoryLayout<T>.size)
-          memcpy(ptr, paramPointer.advanced(by: Int(readerIndex)), MemoryLayout<T>.size)
-          nowIndex += MemoryLayout<T>.size
-          readerIndex += MemoryLayout<T>.size
-          let pointee = ptr.pointee
-          ptr.deinitialize(count: MemoryLayout<UInt32>.size)
-          ptr.deallocate()
-          
-          return pointee
-        }
-        let _ = pointerReader(type: UInt32.self)
-        let lodLevel = pointerReader(type: UInt64.self)
-        for _ in 0..<lodLevel {
-          let size = pointerReader(type: UInt64.self)
-          for _ in 0..<Int(size/UInt64(MemoryLayout<size_t>.size)){
-            _ = pointerReader(type: size_t.self)
-          }
+    class ParaLoader {
+        let file: UnsafeMutablePointer<FILE>
+        let fileSize: Int
+        var nowIndex: Int
+        init(paramPath: String) throws { // Opens `paramPath` for binary reading and measures its size.
+            guard let tmpFile = fopen(paramPath, "rb") else {
+                throw PaddleMobileError.loaderError(message: "open param file error" + paramPath)
+            }
+            file = tmpFile
+            fseek(file, 0, SEEK_END) // Seek to the end so ftell() reports the file length.
+            fileSize = ftell(file)
+            guard fileSize > 0 else { // NOTE(review): this throw appears to leave `tmpFile` open; nothing visible closes it on this path.
+                throw PaddleMobileError.loaderError(message: "param file size is too small")
+            }
+            rewind(file) // Back to the start of the file before any read(tensor:) call.
+            nowIndex = 0
         }
         
-        let _ = pointerReader(type: UInt32.self)
-        let tensorDescSize = pointerReader(type: Int32.self)
-        
-        paramPointer = paramPointer.advanced(by: Int(readerIndex))
-        paramPointer = paramPointer.advanced(by: Int(tensorDescSize))
-        nowIndex += Int(tensorDescSize)
+        /// Reads the next parameter record from `file` and copies its payload into `tensor`.
+        /// - Throws: `PaddleMobileError.loaderError` when the file ends before the record is complete.
+        func read(tensor: Tensor<P>) throws {
+            guard nowIndex <= fileSize else {
+                throw PaddleMobileError.loaderError(message: "out of the file range")
+            }
+            
+            // Reads one fixed-size header field of type `T` and advances `nowIndex` by its byte size.
+            func pointerReader<T>(type: T.Type) throws -> T {
+                // `allocate(capacity:)` counts instances of `T`, not bytes: one slot is all that is needed.
+                let ptr = UnsafeMutablePointer<T>.allocate(capacity: 1)
+                defer { ptr.deallocate() }
+                // A short read would leave `ptr` uninitialised, so fail instead of returning garbage.
+                guard fread(ptr, 1, MemoryLayout<T>.size, file) == MemoryLayout<T>.size else {
+                    throw PaddleMobileError.loaderError(message: "param header read error")
+                }
+                nowIndex += MemoryLayout<T>.size
+                return ptr.pointee
+            }
+            
+            // Header: a UInt32, a level count (`lodLevel`) and, per level, a byte size followed by that many bytes of `size_t` values; all discarded.
+            let _ = try pointerReader(type: UInt32.self)
+            let lodLevel = try pointerReader(type: UInt64.self)
+            for _ in 0..<lodLevel {
+                let size = try pointerReader(type: UInt64.self)
+                for _ in 0..<Int(size/UInt64(MemoryLayout<size_t>.size)){
+                    _ = try pointerReader(type: size_t.self)
+                }
+            }
+            
+            // Another UInt32 is skipped, then `tensorDescSize` bytes are stepped over without being parsed.
+            let _ = try pointerReader(type: UInt32.self)
+            let tensorDescSize = try pointerReader(type: Int32.self)
+            fseek(file, Int(tensorDescSize), SEEK_CUR)
+            nowIndex += Int(tensorDescSize)
+            
+            // The element type is not derived from the file's data type; it is fixed by the generic parameter `P`.
+            // The incoming model currently stores Float values; this part should follow the model instead.
+            //            let tmpCapacity = MemoryLayout<Float>.size * tensor.numel()
+            //            let tmpPointer = UnsafeMutablePointer<Float>.allocate(capacity: tmpCapacity);
+            let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file)
+            guard bytesRead == tensor.data.size else {
+                throw PaddleMobileError.loaderError(message: "param read size error")
+            }
+            
+            // TODO: use script to convert
+            //            let bytesRead = fread(tmpPointer, 1, tmpCapacity, file)
+            //            for i in 0..<tensor.numel() {
+            //                tensor.data[i] = P.init(inFloat: tmpPointer[i])
+            //            }
+            //            tmpPointer.deinitialize(count: tmpCapacity)
+            //            tmpPointer.deallocate()
+            nowIndex += bytesRead
+        }
         
-        let _ = memcpy(tensor.data.pointer, paramPointer, tensor.data.size)
-        paramPointer = paramPointer.advanced(by: Int(tensor.data.size))
-        nowIndex += tensor.data.size
-    }
-    deinit {
+        deinit { // Closes the `FILE` handle stored by init.
+            fclose(file)
+        }
     }
-  }
-  public init(){}
-  func loadModelandParam(_ device:MTLDevice,_ modelData:Data, _ paraLoaderPointer:ParaLoaderWithPointer?, _ paraLoader:ParaLoader?) throws -> Program {
-    do {
-        /// swift protobuf serialized Data to instance class
-        //      let protoProgram = try PaddleMobile_Framework_Proto_ProgramDesc.init(
-        //        serializedData: modelData)
+    class ParaLoaderWithPointer {
+        var paramPointer: UnsafeMutableRawPointer
+        let paramSize: Int
+        var nowIndex: Int
+        init(pPointer: UnsafeMutableRawPointer,pSize:Int) throws { // Stores the pointer and size as given; no bytes are copied here.
+            paramPointer = UnsafeMutableRawPointer.init(pPointer)
+            paramSize = pSize
+            nowIndex = 0 // Total bytes consumed so far by read(tensor:).
+        }
         
-        /// oc protobuf serialized Data to instance class
-      let protoProgram = try ProgramDesc.init(data: (modelData as NSData) as Data)
-      
-      let originProgramDesc = PMProgramDesc.init(protoProgram: protoProgram)
-      let programDesc = ProgramOptimize<P>.init().optimize(originProgramDesc: originProgramDesc)
-      
-//      let programDesc = PMProgramDesc.init(protoProgram: protoProgram)
-
-      print(programDesc)
-      
-      guard programDesc.blocks.count > 0 else {
-        throw PaddleMobileError.loaderError(message: "count of blocks must greater than 0")
-      }
-      
-      // to get feed key and fetch key
-      let block = programDesc.blocks[0]
-      guard let firstOp = block.ops.first, let lastOp = block.ops.last else {
-        throw PaddleMobileError.loaderError(message: "at least two operator")
-      }
-      
-      guard firstOp.type == gFeedType, lastOp.type == gFetchType else {
-        throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch")
-      }
-      
-      guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else {
-        throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found")
-      }
-      guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else {
-        throw PaddleMobileError.loaderError(message: "feed key or fetch key not found")
-      }
-      
-      let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey)
-      
-      // to load memory
-      for block in programDesc.blocks {
-        for varDesc in block.vars {
-          if (varDesc.type == .LodTensor) {
-            guard let tensorDesc = varDesc.tensorDesc else {
-              throw PaddleMobileError.loaderError(message: "get tensor desc failed")
+        func read(tensor: Tensor<P>) throws { // Copies the next record's payload from `paramPointer` into `tensor`.
+            guard nowIndex <= paramSize else {
+                throw PaddleMobileError.loaderError(message: "out of the file range")
             }
-            
-            if (varDesc.persistable
-              && varDesc.type != .FeedMiniBatch
-              && varDesc.type != .FetchList) {
-              let dimArr = tensorDesc.dims
-              
-              guard dimArr.count > 0 else {
-                throw PaddleMobileError.loaderError(message: "tensor desc dim size error")
-              }
-              
-              let dim = Dim.init(inDim: dimArr)
-              let tensor = Tensor<P>.init(inDim: dim, inLayout: tensorDesc.dataLayout)
-              do {
-                if paraLoaderPointer != nil {
-                  try paraLoaderPointer!.read(tensor: tensor)
-                }
+            var readerIndex: Int = 0 // Header bytes consumed so far in this call.
+            func pointerReader<T>(type: T.Type) -> T {
+                let ptr = UnsafeMutablePointer<T>.allocate(capacity: 1) // capacity counts instances of T, not bytes
+                memcpy(ptr, paramPointer.advanced(by: Int(readerIndex)), MemoryLayout<T>.size)
+                nowIndex += MemoryLayout<T>.size
+                readerIndex += MemoryLayout<T>.size
+                let pointee = ptr.pointee
+                ptr.deinitialize(count: 1) // exactly the one instance written above
+                ptr.deallocate()
                 
-                if paraLoader != nil {
-                  try paraLoader!.read(tensor: tensor)
-                }
-              } catch let error {
-                throw error
-              }
-              //              tensor.convert(to: DataLayout.NHWC())
-              //                            tensor.initBuffer(device: device)
-              scope[varDesc.name] = tensor
-            } else {
-              let dim = Dim.init(inDim: tensorDesc.dims)
-              scope[varDesc.name] = Texture.init(device: device, inDim: dim)
+                return pointee
             }
-          } else {
-            if varDesc.name == fetchKey {
-//              scope[varDesc.name] = ResultHolder.init(inDim: [], inResult: [], inCapacity: <#Int#>, inElapsedTime: 0.0)
-            } else if varDesc.name == feedKey {
+            let _ = pointerReader(type: UInt32.self)
+            let lodLevel = pointerReader(type: UInt64.self)
+            for _ in 0..<lodLevel {
+                let size = pointerReader(type: UInt64.self)
+                for _ in 0..<Int(size/UInt64(MemoryLayout<size_t>.size)){
+                    _ = pointerReader(type: size_t.self)
+                }
             }
-          }
+            let _ = pointerReader(type: UInt32.self)
+            let tensorDescSize = pointerReader(type: Int32.self)
+            paramPointer = paramPointer.advanced(by: Int(readerIndex)) // step over the header fields just read
+            paramPointer = paramPointer.advanced(by: Int(tensorDescSize)) // and over the unparsed descriptor
+            nowIndex += Int(tensorDescSize)
+            guard nowIndex + tensor.data.size <= paramSize else { // never copy past the `paramSize` bytes given at init
+                throw PaddleMobileError.loaderError(message: "out of the file range")
+            }
+            let _ = memcpy(tensor.data.pointer, paramPointer, tensor.data.size)
+            paramPointer = paramPointer.advanced(by: Int(tensor.data.size))
+            nowIndex += tensor.data.size
+        }
+        deinit {
         }
-      }
-      
-      let program = Program.init(inProgramDesc: programDesc, inScope: scope)
-      
-      return program
-    } catch _ {
-      throw PaddleMobileError.loaderError(message: "protobuf decoder error")
-    }
-  }
-  public func load(device:MTLDevice, paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) throws -> Program {
-    let modelData = Data.init(bytes:modePointer, count:modelSize)
-    guard let paraLoader = try? ParaLoaderWithPointer.init(pPointer: paramPointer,pSize: paramSize) else {
-      throw PaddleMobileError.loaderError(message: "load para error")
-    }
-    do {
-      let program = try loadModelandParam(device,modelData,paraLoader,nil)
-      return program
-    } catch let error {
-      throw error
     }
-  }
-    
-  public func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program{
-    guard let modelData = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else {
-      throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !")
+    public init(){}
+    /// Decodes `modelData` into a program description and fills a new `Scope` with its variables.
+    /// Persistable LoD tensors are read from whichever parameter loader is non-nil; other LoD tensors become textures.
+    /// - Throws: `PaddleMobileError`; errors of any other type are reported as "protobuf decoder error".
+    func loadModelandParam(_ device:MTLDevice,_ modelData:Data, _ paraLoaderPointer:ParaLoaderWithPointer?, _ paraLoader:ParaLoader?) throws -> Program {
+        do {
+            /// swift protobuf serialized Data to instance class
+            //      let protoProgram = try PaddleMobile_Framework_Proto_ProgramDesc.init(
+            //        serializedData: modelData)
+            
+            /// oc protobuf serialized Data to instance class
+            let protoProgram = try ProgramDesc.init(data: (modelData as NSData) as Data)
+            
+            let originProgramDesc = PMProgramDesc.init(protoProgram: protoProgram)
+            let programDesc = ProgramOptimize<P>.init().optimize(originProgramDesc: originProgramDesc)
+            
+            //      let programDesc = PMProgramDesc.init(protoProgram: protoProgram)
+            
+            print(programDesc)
+            
+            guard programDesc.blocks.count > 0 else {
+                throw PaddleMobileError.loaderError(message: "count of blocks must greater than 0")
+            }
+            
+            // to get feed key and fetch key
+            let block = programDesc.blocks[0]
+            guard let firstOp = block.ops.first, let lastOp = block.ops.last else {
+                throw PaddleMobileError.loaderError(message: "at least two operator")
+            }
+            
+            guard firstOp.type == gFeedType, lastOp.type == gFetchType else {
+                throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch")
+            }
+            
+            guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else {
+                throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found")
+            }
+            guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else {
+                throw PaddleMobileError.loaderError(message: "feed key or fetch key not found")
+            }
+            
+            let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey)
+            
+            // to load memory
+            for block in programDesc.blocks {
+                for varDesc in block.vars {
+                    if (varDesc.type == .LodTensor) {
+                        guard let tensorDesc = varDesc.tensorDesc else {
+                            throw PaddleMobileError.loaderError(message: "get tensor desc failed")
+                        }
+                        
+                        if (varDesc.persistable
+                            && varDesc.type != .FeedMiniBatch
+                            && varDesc.type != .FetchList) {
+                            let dimArr = tensorDesc.dims
+                            
+                            guard dimArr.count > 0 else {
+                                throw PaddleMobileError.loaderError(message: "tensor desc dim size error")
+                            }
+                            
+                            let dim = Dim.init(inDim: dimArr)
+                            let tensor = Tensor<P>.init(inDim: dim, inLayout: tensorDesc.dataLayout)
+                            // Fill the tensor from whichever loader(s) were supplied; a nil loader is skipped.
+                            try paraLoaderPointer?.read(tensor: tensor)
+                            try paraLoader?.read(tensor: tensor)
+                            //              tensor.convert(to: DataLayout.NHWC())
+                            //                            tensor.initBuffer(device: device)
+                            scope[varDesc.name] = tensor
+                        } else {
+                            let dim = Dim.init(inDim: tensorDesc.dims)
+                            scope[varDesc.name] = Texture.init(device: device, inDim: dim)
+                        }
+                    } else {
+                        // Variables of other types get no scope entry here: both branches below are empty.
+                        if varDesc.name == fetchKey {
+                            //              scope[varDesc.name] = ResultHolder.init(inDim: [], inResult: [], inCapacity: <#Int#>, inElapsedTime: 0.0)
+                        } else if varDesc.name == feedKey {
+                        }
+                    }
+                }
+            }
+            
+            let program = Program.init(inProgramDesc: programDesc, inScope: scope)
+            
+            return program
+        } catch let error as PaddleMobileError {
+            // Keep the specific loader message instead of relabelling it as a decoder failure.
+            throw error
+        } catch _ {
+            // Anything else can only have come from decoding `modelData` above.
+            throw PaddleMobileError.loaderError(message: "protobuf decoder error")
+        }
     }
-    guard let paraLoader = try? ParaLoader.init(paramPath: paraPath) else {
-      throw PaddleMobileError.loaderError(message: "load para error")
+    /// Builds a program from in-memory model bytes and an in-memory parameter blob.
+    /// - Throws: "load para error" if the loader cannot be created; otherwise whatever `loadModelandParam` throws.
+    public func load(device:MTLDevice, paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) throws -> Program {
+        let modelBytes = Data.init(bytes:modePointer, count:modelSize)
+        guard let pointerLoader = try? ParaLoaderWithPointer.init(pPointer: paramPointer,pSize: paramSize) else {
+            throw PaddleMobileError.loaderError(message: "load para error")
+        }
+        // Errors from loadModelandParam reach the caller unchanged, exactly as the
+        // former catch-and-rethrow wrapper passed them on.
+        let program = try loadModelandParam(device,modelBytes,pointerLoader,nil)
+        return program
     }
     
-    do {
-      let program = try loadModelandParam(device,modelData,nil,paraLoader)
-      return program
-    } catch let error {
-      throw error
+    /// Builds a program from a model file and a parameter file on disk.
+    /// - Throws: A loader error if either file cannot be opened; otherwise whatever `loadModelandParam` throws.
+    public func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program{
+        guard let modelBytes = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else {
+            throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !")
+        }
+        guard let fileLoader = try? ParaLoader.init(paramPath: paraPath) else {
+            throw PaddleMobileError.loaderError(message: "load para error")
+        }
+        
+        // Errors from loadModelandParam reach the caller unchanged, exactly as the
+        // former catch-and-rethrow wrapper passed them on.
+        let program = try loadModelandParam(device,modelBytes,nil,fileLoader)
+        return program
     }
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift
index 97fe0a8fbadf443a5b71ce150c37c4b023af65c9..adce1015520d4e429d02ec46bbb0c69ffc5de6ac 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift
@@ -17,337 +17,337 @@ import MetalKit
 import CoreMedia
 
 protocol Tensorial: Variant {
-  var dim: Dim { get set }
-  func numel() -> Int
-  var layout: DataLayout { get }
+    var dim: Dim { get set }
+    func numel() -> Int
+    var layout: DataLayout { get }
 }
 
 extension Tensorial {
-  func numel() -> Int {
-    return dim.numel()
-  }
+    func numel() -> Int {
+        return dim.numel()
+    }
 }
 
 
 
 class Tensor<P: PrecisionType>: Tensorial {
-  
-  var data: Data
-  var dim: Dim
-  var buffer: MTLBuffer!
-  private(set) var layout: DataLayout
-  
-  class Data {
-    init(inSize: Int, inPointer: UnsafeMutablePointer<P>) {
-      size = inSize
-      pointer = inPointer
-    }
-    let size: Int
-    var pointer: UnsafeMutablePointer<P>
-    subscript(index: Int) -> P{
-      get {
-        return pointer[index]
-      }
-      set {
-        pointer[index] = newValue
-      }
-    }
-    func release() {
-      pointer.deinitialize(count: size)
-      pointer.deallocate()
-    }
-    deinit {
-      //            release()
-    }
-  }
-  
-  init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) {
-    dim = inDim
-    let size = inDim.numel() * MemoryLayout<P>.size
-    let pointer = UnsafeMutablePointer<P>.allocate(capacity: size)
-    data = Data.init(inSize: size, inPointer: pointer)
-    layout = inLayout
-  }
-  
-  func convert(to: DataLayout) {
-    guard to != layout else {
-      return
-    }
     
-    guard dim.cout() == 4 else {
-      return
-    }
+    var data: Data
+    var dim: Dim
+    var buffer: MTLBuffer!
+    private(set) var layout: DataLayout
     
-    guard layout == DataLayout.NCHW() && to == DataLayout.NHWC() else {
-      // other not support
-      return
-    }
-    let newPointer = UnsafeMutablePointer<P>.allocate(capacity: data.size)
-    
-    if layout == DataLayout.NCHW() {
-      NCHW2NHWC(newPtr: newPointer)
+    class Data {
+        init(inSize: Int, inPointer: UnsafeMutablePointer<P>) {
+            size = inSize
+            pointer = inPointer
+        }
+        let size: Int
+        var pointer: UnsafeMutablePointer<P>
+        subscript(index: Int) -> P{
+            get {
+                return pointer[index]
+            }
+            set {
+                pointer[index] = newValue
+            }
+        }
+        func release() {
+            pointer.deinitialize(count: size)
+            pointer.deallocate()
+        }
+        deinit {
+            //            release()
+        }
     }
     
-    data.release()
-    data.pointer = newPointer
-    layout = to
-  }
-  
-
-  
-  func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16, padWhenOneC: Bool = false, convertToNHWC: Bool = true, withTranspose: Bool = false) {
-    if convertToNHWC {
-//      print(layout)
-      convert(to: DataLayout.NHWC())
+    init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) {
+        dim = inDim
+        let size = inDim.numel() * MemoryLayout<P>.size
+        let pointer = UnsafeMutablePointer<P>.allocate(capacity: size)
+        data = Data.init(inSize: size, inPointer: pointer)
+        layout = inLayout
     }
     
-    if withTranspose {
-      let transposePointer = UnsafeMutablePointer<P>.allocate(capacity: numel())
-      let n = dim[0]
-      let hwc = numel()/n
-      for j in 0..<hwc {
-        for i in 0..<n {
-          //data[i * hwc + j]
-          transposePointer[j * n + i] = data[i * hwc + j]
+    func convert(to: DataLayout) {
+        guard to != layout else {
+            return
         }
-      }
-
-      dim.swapeDimAt(index1: 0, index2: 3)
-      data.release()
-      data.pointer = transposePointer
+        
+        guard dim.cout() == 4 else {
+            return
+        }
+        
+        guard layout == DataLayout.NCHW() && to == DataLayout.NHWC() else {
+            // other not support
+            return
+        }
+        let newPointer = UnsafeMutablePointer<P>.allocate(capacity: data.size)
+        
+        if layout == DataLayout.NCHW() {
+            NCHW2NHWC(newPtr: newPointer)
+        }
+        
+        data.release()
+        data.pointer = newPointer
+        layout = to
     }
     
-    guard let floatPointer = data.pointer as? UnsafeMutablePointer<Float32> else {
-      fatalError(" not support yet ")
-    }
     
-    let precisionSize: Int
-    switch precision {
-    case .Float32:
-      precisionSize = 4
-    case .Float16:
-      precisionSize = 2
-    }
     
-    if dim.cout() == 4 {
-      if layout == DataLayout.NHWC() {
-        let C = dim[3]
-        let cSlices = (C + 3) / 4
-        let paddedC = cSlices * 4
-        let count = paddedC * dim[0] * dim[1] * dim[2]
-        if C == paddedC {
-          buffer = device.makeBuffer(length: count * precisionSize)
-          switch precision {
-          case .Float32:
-            buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
-          case .Float16:
-            float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
-          }
-        } else if C == 1 && !padWhenOneC {
-          buffer = device.makeBuffer(length: numel() * precisionSize)
-          switch precision {
-          case .Float32:
-            buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
-          case .Float16:
-            float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
-          }
-        } else {
-          buffer = device.makeBuffer(length: count * precisionSize)
-          let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
-          var tmpPointer = floatPointer
-          var dstPtr = convertedPointer
-          for _ in 0..<dim[0] * dim[1] * dim[2] {
-            for j in 0..<paddedC {
-              if j < C {
-                dstPtr[j] = tmpPointer[j]
-              } else {
-                dstPtr[j] = 0
-              }
+    func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16, padWhenOneC: Bool = false, convertToNHWC: Bool = true, withTranspose: Bool = false) {
+        if convertToNHWC {
+            //      print(layout)
+            convert(to: DataLayout.NHWC())
+        }
+        
+        if withTranspose {
+            let transposePointer = UnsafeMutablePointer<P>.allocate(capacity: numel())
+            let n = dim[0]
+            let hwc = numel()/n
+            for j in 0..<hwc {
+                for i in 0..<n {
+                    //data[i * hwc + j]
+                    transposePointer[j * n + i] = data[i * hwc + j]
+                }
             }
-            tmpPointer += C
-            dstPtr += paddedC
-          }
-          
-          switch precision {
-          case .Float32:
-            buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
-          case .Float16:
-            float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
-          }
-          
-          convertedPointer.deinitialize(count: count)
-          convertedPointer.deallocate()
+            
+            dim.swapeDimAt(index1: 0, index2: 3)
+            data.release()
+            data.pointer = transposePointer
         }
-      } else {
-        let C = dim[3]
-        let cSlices = (C + 3) / 4
-        let paddedC = cSlices * 4
-        let count = paddedC * dim[0] * dim[1] * dim[2]
-        if C == paddedC {
-          buffer = device.makeBuffer(length: count * precisionSize)
-          switch precision {
-          case .Float32:
-            buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
-          case .Float16:
-            float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
-          }
-        } else if C == 1 {
-          fatalError(" not support ")
-        } else {
-          buffer = device.makeBuffer(length: count * precisionSize)
-          let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
-          var tmpPointer = floatPointer
-          var dstPtr = convertedPointer
-          for _ in 0..<dim[0] * dim[1] * dim[2] {
-            for j in 0..<paddedC {
-              if j < C {
-                dstPtr[j] = tmpPointer[j]
-              } else {
-                dstPtr[j] = 0
-              }
+        
+        guard let floatPointer = data.pointer as? UnsafeMutablePointer<Float32> else {
+            fatalError(" not support yet ")
+        }
+        
+        let precisionSize: Int
+        switch precision {
+        case .Float32:
+            precisionSize = 4
+        case .Float16:
+            precisionSize = 2
+        }
+        
+        if dim.cout() == 4 {
+            if layout == DataLayout.NHWC() {
+                let C = dim[3]
+                let cSlices = (C + 3) / 4
+                let paddedC = cSlices * 4
+                let count = paddedC * dim[0] * dim[1] * dim[2]
+                if C == paddedC {
+                    buffer = device.makeBuffer(length: count * precisionSize)
+                    switch precision {
+                    case .Float32:
+                        buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
+                    case .Float16:
+                        float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
+                    }
+                } else if C == 1 && !padWhenOneC {
+                    buffer = device.makeBuffer(length: numel() * precisionSize)
+                    switch precision {
+                    case .Float32:
+                        buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
+                    case .Float16:
+                        float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
+                    }
+                } else {
+                    buffer = device.makeBuffer(length: count * precisionSize)
+                    let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
+                    var tmpPointer = floatPointer
+                    var dstPtr = convertedPointer
+                    for _ in 0..<dim[0] * dim[1] * dim[2] {
+                        for j in 0..<paddedC {
+                            if j < C {
+                                dstPtr[j] = tmpPointer[j]
+                            } else {
+                                dstPtr[j] = 0
+                            }
+                        }
+                        tmpPointer += C
+                        dstPtr += paddedC
+                    }
+                    
+                    switch precision {
+                    case .Float32:
+                        buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
+                    case .Float16:
+                        float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
+                    }
+                    
+                    convertedPointer.deinitialize(count: count)
+                    convertedPointer.deallocate()
+                }
+            } else {
+                let C = dim[3]
+                let cSlices = (C + 3) / 4
+                let paddedC = cSlices * 4
+                let count = paddedC * dim[0] * dim[1] * dim[2]
+                if C == paddedC {
+                    buffer = device.makeBuffer(length: count * precisionSize)
+                    switch precision {
+                    case .Float32:
+                        buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
+                    case .Float16:
+                        float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
+                    }
+                } else if C == 1 {
+                    fatalError(" not support ")
+                } else {
+                    buffer = device.makeBuffer(length: count * precisionSize)
+                    let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
+                    var tmpPointer = floatPointer
+                    var dstPtr = convertedPointer
+                    for _ in 0..<dim[0] * dim[1] * dim[2] {
+                        for j in 0..<paddedC {
+                            if j < C {
+                                dstPtr[j] = tmpPointer[j]
+                            } else {
+                                dstPtr[j] = 0
+                            }
+                        }
+                        tmpPointer += C
+                        dstPtr += paddedC
+                    }
+                    
+                    switch precision {
+                    case .Float32:
+                        buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
+                    case .Float16:
+                        float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
+                    }
+                    convertedPointer.deinitialize(count: count)
+                    convertedPointer.deallocate()
+                }
             }
-            tmpPointer += C
-            dstPtr += paddedC
-          }
-          
-          switch precision {
-          case .Float32:
-            buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
-          case .Float16:
-            float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
-          }
-          convertedPointer.deinitialize(count: count)
-          convertedPointer.deallocate()
+        } else if dim.cout() == 1 {
+            let num = ((numel() + 3) / 4) * 4
+            buffer = device.makeBuffer(length: num * precisionSize)
+            switch precision {
+            case .Float32:
+                buffer?.contents().copyMemory(from: data.pointer, byteCount: num * MemoryLayout<P>.stride)
+            case .Float16:
+                float32ToFloat16(input: floatPointer, output: buffer.contents(), count: num)
+            }
+        } else {
+            fatalError(" not support !")
         }
-      }
-    } else if dim.cout() == 1 {
-      let num = ((numel() + 3) / 4) * 4
-      buffer = device.makeBuffer(length: num * precisionSize)
-      switch precision {
-      case .Float32:
-        buffer?.contents().copyMemory(from: data.pointer, byteCount: num * MemoryLayout<P>.stride)
-      case .Float16:
-        float32ToFloat16(input: floatPointer, output: buffer.contents(), count: num)
-      }
-    } else {
-      fatalError(" not support !")
+        //TODO: release
+        data.release()
     }
-    //TODO: release
-    data.release()
-  }
-  
-  var n: Int {
-    get {
-      if dim.cout() == 4 {
-        if layout == DataLayout.NCHW() {
-          return dim[0]
-        } else if layout == DataLayout.NHWC() {
-          return dim[0]
-        } else {
-          fatalError(" unsupport ")
+    
+    var n: Int {
+        get {
+            if dim.cout() == 4 {
+                if layout == DataLayout.NCHW() {
+                    return dim[0]
+                } else if layout == DataLayout.NHWC() {
+                    return dim[0]
+                } else {
+                    fatalError(" unsupport ")
+                }
+            } else {
+                fatalError()
+            }
         }
-      } else {
-        fatalError()
-      }
     }
-  }
-  
-  var width: Int {
-    get {
-      if dim.cout() == 4 {
-        if layout == DataLayout.NHWC() {
-          return dim[2]
-        } else if layout == DataLayout.NCHW() {
-          return dim[3]
-        } else {
-          fatalError(" unsupport ")
+    
+    var width: Int {
+        get {
+            if dim.cout() == 4 {
+                if layout == DataLayout.NHWC() {
+                    return dim[2]
+                } else if layout == DataLayout.NCHW() {
+                    return dim[3]
+                } else {
+                    fatalError(" unsupport ")
+                }
+            } else {
+                fatalError()
+            }
         }
-      } else {
-        fatalError()
-      }
     }
-  }
-  
-  var height: Int {
-    get {
-      if dim.cout() == 4 {
-        if layout == DataLayout.NHWC() {
-          return dim[1]
-        } else if layout == DataLayout.NCHW() {
-          return dim[2]
-        } else {
-          fatalError(" unsupport ")
+    
+    var height: Int {
+        get {
+            if dim.cout() == 4 {
+                if layout == DataLayout.NHWC() {
+                    return dim[1]
+                } else if layout == DataLayout.NCHW() {
+                    return dim[2]
+                } else {
+                    fatalError(" unsupport ")
+                }
+            } else {
+                fatalError()
+            }
         }
-      } else {
-        fatalError()
-      }
     }
-  }
-  
-  var channel: Int {
-    get {
-      if dim.cout() == 4 {
-        if layout == DataLayout.NHWC() {
-          return dim[3]
-        } else if layout == DataLayout.NCHW() {
-          return dim[1]
-        } else {
-          fatalError(" unsupport ")
+    
+    var channel: Int {
+        get {
+            if dim.cout() == 4 {
+                if layout == DataLayout.NHWC() {
+                    return dim[3]
+                } else if layout == DataLayout.NCHW() {
+                    return dim[1]
+                } else {
+                    fatalError(" unsupport ")
+                }
+            } else {
+                fatalError()
+            }
         }
-      } else {
-        fatalError()
-      }
     }
-  }
-  
-  
-  func NCHW2NHWC(newPtr: UnsafeMutablePointer<P>) {
-    let N = dim[0]
-    let C = dim[1]
-    let H = dim[2]
-    let W = dim[3]
-    let HXW = H * W
-    let CXHXW = C * H * W
     
-    var index: Int = 0
-    for n in 0..<N {
-      for h in 0..<H{
-        for w in 0..<W{
-          for c in 0..<C{
-            newPtr[index] = data.pointer[n * CXHXW + c * HXW + h * W + w]
-            index += 1
-          }
+    
+    func NCHW2NHWC(newPtr: UnsafeMutablePointer<P>) {
+        let N = dim[0]
+        let C = dim[1]
+        let H = dim[2]
+        let W = dim[3]
+        let HXW = H * W
+        let CXHXW = C * H * W
+        
+        var index: Int = 0
+        for n in 0..<N {
+            for h in 0..<H{
+                for w in 0..<W{
+                    for c in 0..<C{
+                        newPtr[index] = data.pointer[n * CXHXW + c * HXW + h * W + w]
+                        index += 1
+                    }
+                }
+            }
         }
-      }
+        dim.swapeDimAt(index1: 1, index2: 3)
     }
-    dim.swapeDimAt(index1: 1, index2: 3)
-  }
 }
 
 extension Tensor {
-  
-  var debugDescription: String {
-    var str = "dim: \(dim) \n"
-    str += "MTLBuffer: \(self.buffer) \n"
-    for i in 0..<buffer.length/MemoryLayout<P>.size {
-      str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])"
+    
+    var debugDescription: String {
+        var str = "dim: \(dim) \n"
+        str += "MTLBuffer: \(self.buffer) \n"
+        for i in 0..<buffer.length/MemoryLayout<P>.size {
+            str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])"
+        }
+        return str
+    }
+    
+    func logDataPointer(header: String = "") {
+        print(header)
+        var str = ""
+        str += "data size: \(data.size) \n"
+        str += "dim: \(dim) \n"
+        for i in 0..<numel() {
+            str += " \(data.pointer[i])"
+        }
+        print(str)
     }
-    return str
-  }
-  
-  func logDataPointer(header: String = "") {
-    print(header)
-    var str = ""
-    str += "data size: \(data.size) \n"
-    str += "dim: \(dim) \n"
-    for i in 0..<numel() {
-      str += " \(data.pointer[i])"
+    
+    var description: String {
+        return debugDescription
     }
-    print(str)
-  }
-  
-  var description: String {
-    return debugDescription
-  }
-  
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift
index 14631464d88e85bc6ac6789a2508da5b64dd5857..cc1ed05e121524fec9ff35ce1df3d2c54b4a8c88 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift
@@ -16,26 +16,26 @@ import Metal
 import Foundation
 
 class InputTexture {
-  let mtlTexture: MTLTexture
-  let expectDim: Dim
-  init(inMTLTexture: MTLTexture, inExpectDim: Dim) {
-    mtlTexture = inMTLTexture
-    expectDim = inExpectDim
-  }
+    let mtlTexture: MTLTexture
+    let expectDim: Dim
+    init(inMTLTexture: MTLTexture, inExpectDim: Dim) {
+        mtlTexture = inMTLTexture
+        expectDim = inExpectDim
+    }
 }
 
 extension InputTexture {
-  var description: String {
-    get{
-      return mtlTexture.description
+    var description: String {
+        get{
+            return mtlTexture.description
+        }
     }
-  }
-  
-  var debugDescription: String {
-    get {
-      return mtlTexture.debugDescription ?? " MetalTexture "
+    
+    var debugDescription: String {
+        get {
+            return mtlTexture.debugDescription ?? " MetalTexture "
+        }
     }
-  }
 }
 
 
@@ -46,17 +46,17 @@ extension InputTexture {
  .height = b
  .len = a * d + 3 / 4
  
-低于 4 维的 tensor，transpose 必须为 [0, 1, 2, 3] 既不考虑 transpose
+ 低于 4 维的 tensor，transpose 必须为 [0, 1, 2, 3] 既不考虑 transpose
  
-// TODO transpose 对于低维 tensor 的扩展原则。。。
-// [a, b] -> [1, 1, a, b] transpose 必须为 [0, 1, x, x]
-// [a] -> [1, 1, 1, a] transpose 必须为 [0, 1, 2, 3]
-// [a, b, c] -> [1, a, b, c] tranpose 必须为 [0, x, x, x]
-
-3 维 tensor [a, b, c] 对应的 texture_2darray,
-.width = c
-.height = b
-.len = a + 3 / 4
+ // TODO transpose 对于低维 tensor 的扩展原则。。。
+ // [a, b] -> [1, 1, a, b] transpose 必须为 [0, 1, x, x]
+ // [a] -> [1, 1, 1, a] transpose 必须为 [0, 1, 2, 3]
+ // [a, b, c] -> [1, a, b, c] tranpose 必须为 [0, x, x, x]
+ 
+ 3 维 tensor [a, b, c] 对应的 texture_2darray,
+ .width = c
+ .height = b
+ .len = a + 3 / 4
  
  2 维 tensor [a, b] 对应的 texture_2darray
  .width = b + 3 / 4
@@ -69,136 +69,136 @@ extension InputTexture {
  .len = 1
  */
 public class Texture: Tensorial {
-  public var dim: Dim
-  public var tensorDim: Dim
-  
-  /// tensor dim pad to four
-  public var padToFourDim: Dim
-  private var textureDesc: MTLTextureDescriptor!
-  public var metalTexture: MTLTexture!
-  var transpose: [Int] = [0, 1, 2, 3]
-  
-  func elementCount() -> Int {
-    return metalTexture.width * metalTexture.height * metalTexture.arrayLength * 4
-  }
-  
-  func toTensor() -> [Float32] {
-    guard  padToFourDim.cout() == 4 else {
-      fatalError("- not support -")
-    }
-    return metalTexture.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
-  }
-  
-  func realNHWC() -> [Float32] {
-    guard padToFourDim.cout() == 4 else {
-      fatalError(" - not support - ")
-    }
-    return metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
-  }
-  
-  public func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) {
-    transpose = inTranspose
-    for i in 0..<(4 - tensorDim.cout()) {
-      if i != inTranspose[i] {
-        fatalError()
-      }
-    }
+    public var dim: Dim
+    public var tensorDim: Dim
     
-    let newDim = transpose.map { padToFourDim[$0] }
-    let newLayout = transpose.map { layout.layoutWithDim[$0] }
+    /// tensor dim pad to four
+    public var padToFourDim: Dim
+    private var textureDesc: MTLTextureDescriptor!
+    public var metalTexture: MTLTexture!
+    var transpose: [Int] = [0, 1, 2, 3]
     
-    layout = DataLayout.init(newLayout)
-    dim = Dim.init(inDim: newDim)
+    func elementCount() -> Int {
+        return metalTexture.width * metalTexture.height * metalTexture.arrayLength * 4
+    }
     
-    let tmpTextureDes = MTLTextureDescriptor.init()
-    tmpTextureDes.textureType = .type2DArray
-    tmpTextureDes.depth = 1
+    func toTensor() -> [Float32] {
+        guard  padToFourDim.cout() == 4 else {
+            fatalError("- not support -")
+        }
+        return metalTexture.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
+    }
     
-    switch tensorDim.cout() {
-    case 4:
-      tmpTextureDes.width = newDim[2]
-      tmpTextureDes.height = newDim[1]
-      tmpTextureDes.arrayLength = ((newDim[0]) * (newDim[3]) + 3) / 4
-    case 3:
-      tmpTextureDes.width = newDim[3]
-      tmpTextureDes.height = newDim[2]
-      tmpTextureDes.arrayLength = (newDim[1] + 3) / 4
-    case 2, 1:
-      tmpTextureDes.width = (newDim[3] + 3) / 4
-      tmpTextureDes.height = newDim[2]
-      tmpTextureDes.arrayLength = 1
-    default:
-      fatalError("unreachable")
+    func realNHWC() -> [Float32] {
+        guard padToFourDim.cout() == 4 else {
+            fatalError(" - not support - ")
+        }
+        return metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
     }
-   
-    if computePrecision == .Float16 {
-      tmpTextureDes.pixelFormat = .rgba16Float
-    } else if computePrecision == .Float32 {
-      tmpTextureDes.pixelFormat = .rgba32Float
+    
+    public func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) {
+        transpose = inTranspose
+        for i in 0..<(4 - tensorDim.cout()) {
+            if i != inTranspose[i] {
+                fatalError()
+            }
+        }
+        
+        let newDim = transpose.map { padToFourDim[$0] }
+        let newLayout = transpose.map { layout.layoutWithDim[$0] }
+        
+        layout = DataLayout.init(newLayout)
+        dim = Dim.init(inDim: newDim)
+        
+        let tmpTextureDes = MTLTextureDescriptor.init()
+        tmpTextureDes.textureType = .type2DArray
+        tmpTextureDes.depth = 1
+        
+        switch tensorDim.cout() {
+        case 4:
+            tmpTextureDes.width = newDim[2]
+            tmpTextureDes.height = newDim[1]
+            tmpTextureDes.arrayLength = ((newDim[0]) * (newDim[3]) + 3) / 4
+        case 3:
+            tmpTextureDes.width = newDim[3]
+            tmpTextureDes.height = newDim[2]
+            tmpTextureDes.arrayLength = (newDim[1] + 3) / 4
+        case 2, 1:
+            tmpTextureDes.width = (newDim[3] + 3) / 4
+            tmpTextureDes.height = newDim[2]
+            tmpTextureDes.arrayLength = 1
+        default:
+            fatalError("unreachable")
+        }
+        
+        if computePrecision == .Float16 {
+            tmpTextureDes.pixelFormat = .rgba16Float
+        } else if computePrecision == .Float32 {
+            tmpTextureDes.pixelFormat = .rgba32Float
+        }
+        
+        tmpTextureDes.usage = [.shaderRead, .shaderWrite]
+        tmpTextureDes.storageMode = .shared
+        textureDesc = tmpTextureDes
+        metalTexture = device.makeTexture(descriptor: tmpTextureDes) ?! " texture nil "
     }
     
-    tmpTextureDes.usage = [.shaderRead, .shaderWrite]
-    tmpTextureDes.storageMode = .shared
-    textureDesc = tmpTextureDes
-    metalTexture = device.makeTexture(descriptor: tmpTextureDes) ?! " texture nil "
-  }
-  
-  public func updateDims(inTensorDim: Dim, inDim: Dim) {
-    var fourDim: Dim
-    if inDim.cout() == 4 {
-      fourDim = inDim
-    } else if inDim.cout() < 4 {
-      var fourDimNum: [Int] = []
-      for _ in 0..<(4 - inDim.cout()) {
-        fourDimNum.append(1)
-      }
-      fourDimNum.append(contentsOf: inDim.dims)
-      fourDim = Dim.init(inDim: fourDimNum)
-    } else {
-      fatalError(" not support ")
+    public func updateDims(inTensorDim: Dim, inDim: Dim) {
+        var fourDim: Dim
+        if inDim.cout() == 4 {
+            fourDim = inDim
+        } else if inDim.cout() < 4 {
+            var fourDimNum: [Int] = []
+            for _ in 0..<(4 - inDim.cout()) {
+                fourDimNum.append(1)
+            }
+            fourDimNum.append(contentsOf: inDim.dims)
+            fourDim = Dim.init(inDim: fourDimNum)
+        } else {
+            fatalError(" not support ")
+        }
+        
+        tensorDim = inTensorDim
+        dim = fourDim
+        padToFourDim = fourDim
     }
     
-    tensorDim = inTensorDim
-    dim = fourDim
-    padToFourDim = fourDim
-  }
-  
-  // 初始化时 dim padToFourDim 模型中的维度（一般来说 nchw），前面补全0
-  init(device: MTLDevice, inDim: Dim) {
-    print(" in dim > \(inDim)")
-    var fourDim: Dim
-    if inDim.cout() == 4 {
-      fourDim = inDim
-    } else if inDim.cout() < 4 {
-      var fourDimNum: [Int] = []
-      for _ in 0..<(4 - inDim.cout()) {
-        fourDimNum.append(1)
-      }
-      fourDimNum.append(contentsOf: inDim.dims)
-      fourDim = Dim.init(inDim: fourDimNum)
-    } else {
-      fatalError(" not support ")
+    // 初始化时 dim padToFourDim 模型中的维度（一般来说 nchw），前面补全0
+    init(device: MTLDevice, inDim: Dim) {
+        print(" in dim > \(inDim)")
+        var fourDim: Dim
+        if inDim.cout() == 4 {
+            fourDim = inDim
+        } else if inDim.cout() < 4 {
+            var fourDimNum: [Int] = []
+            for _ in 0..<(4 - inDim.cout()) {
+                fourDimNum.append(1)
+            }
+            fourDimNum.append(contentsOf: inDim.dims)
+            fourDim = Dim.init(inDim: fourDimNum)
+        } else {
+            fatalError(" not support ")
+        }
+        tensorDim = inDim
+        dim = fourDim
+        padToFourDim = fourDim
+        layout = DataLayout.init([(.N, fourDim[0]), (.C, fourDim[1]), (.H, fourDim[2]), (.W, fourDim[3])])
     }
-    tensorDim = inDim
-    dim = fourDim
-    padToFourDim = fourDim
-    layout = DataLayout.init([(.N, fourDim[0]), (.C, fourDim[1]), (.H, fourDim[2]), (.W, fourDim[3])])
-  }
-  
-  private(set) var layout: DataLayout
+    
+    private(set) var layout: DataLayout
 }
 
 extension Texture {
-  public var description: String {
-    return debugDescription
-  }
-  
-  public var debugDescription: String{
-    var str = ""
-    str += "Dim: \(dim) \n value:[ "
-    str += "\(metalTexture)"
-    str += " ]"
-    return str
-  }
-  
+    public var description: String {
+        return debugDescription
+    }
+    
+    public var debugDescription: String{
+        var str = ""
+        str += "Dim: \(dim) \n value:[ "
+        str += "\(metalTexture)"
+        str += " ]"
+        return str
+    }
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift
index fcedbd36f7f50b348aab97de18c9fee414f182cf..f16344e5005fd678e07a0bf2dc7e115cb5d469a9 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift
@@ -27,7 +27,7 @@ class OpCreator<P: PrecisionType> {
         }
     }
     
-  func creat(device: MTLDevice, opDesc: PMOpDesc, scope: Scope, initContext: InitContext) throws -> Runable & InferShaperable {
+    func creat(device: MTLDevice, opDesc: PMOpDesc, scope: Scope, initContext: InitContext) throws -> Runable & InferShaperable {
         guard let opCreator = opCreators[opDesc.type] else {
             throw PaddleMobileError.opError(message: "there is no " + opDesc.type + " yet")
         }
@@ -69,6 +69,6 @@ class OpCreator<P: PrecisionType> {
          gConvAddAddPreluType       :     ConvAddAddPreluOp<P>.creat,
          gElementwiseAddPreluType   :     ElementwiseAddPreluOp<P>.creat,
          gFusionConvAddType         :     ConvAddOp<P>.creat]
-  
+    
     private init(){}
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpParam.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpParam.swift
index 01c22166642a1e16717f2cad3d434d2fb1ed0f76..0af90e411b9d1c7f7d2bebd990de24839ecd58c4 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpParam.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpParam.swift
@@ -22,199 +22,199 @@ import Foundation
  */
 
 protocol OpParam {
-  associatedtype OutputType: Variant
-  var output: OutputType { get set }
-  func outputDesc() -> String
-  
-  //associatedtype ParamPrecisionType: PrecisionType
-  init(opDesc: PMOpDesc, inScope: Scope) throws
-  static func getFirstTensor<VarType: Variant>(key: String, map: [String : [String]], from: Scope) throws -> VarType
-  static func inputX<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
-  static func inputBiase<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
-  static func inputMean<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
-  static func inputScale<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
-  static func inputVariance<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
-  static func inputFilter<VarType: Variant>(paraInputs: [String : [String]], from: Scope) throws -> VarType
-  static func input<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
-  static func output<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
-  static func outputY<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
-  static func inputY<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
-  
-  static func inputImage<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
-  
-  static func outputBoxes<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
-  
-  static func outputOut<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
-  
-  static func outputVariances<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
-  
-  static func getAttr<T>(key: String, attrs: [String : Attr]) throws -> T
-  
-  static func paramInputAlpha<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
-  
+    associatedtype OutputType: Variant
+    var output: OutputType { get set }
+    func outputDesc() -> String
+    
+    //associatedtype ParamPrecisionType: PrecisionType
+    init(opDesc: PMOpDesc, inScope: Scope) throws
+    static func getFirstTensor<VarType: Variant>(key: String, map: [String : [String]], from: Scope) throws -> VarType
+    static func inputX<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+    static func inputBiase<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+    static func inputMean<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+    static func inputScale<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+    static func inputVariance<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+    static func inputFilter<VarType: Variant>(paraInputs: [String : [String]], from: Scope) throws -> VarType
+    static func input<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+    static func output<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
+    static func outputY<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
+    static func inputY<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+    
+    static func inputImage<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+    
+    static func outputBoxes<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
+    
+    static func outputOut<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
+    
+    static func outputVariances<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
+    
+    static func getAttr<T>(key: String, attrs: [String : Attr]) throws -> T
+    
+    static func paramInputAlpha<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+    
 }
 
 extension OpParam {
-  func outputDesc() -> String {
-    return output.debugDescription
-  }
-  
-  static func getFirstTensor<VarType: Variant>(key: String, map: [String : [String]], from: Scope) throws -> VarType {
-    guard let mapKeys = map[key], mapKeys.count > 0 else {
-      throw PaddleMobileError.paramError(message: key + " not found in \(map) or maped values is empty")
+    func outputDesc() -> String {
+        return output.debugDescription
+    }
+    
+    static func getFirstTensor<VarType: Variant>(key: String, map: [String : [String]], from: Scope) throws -> VarType {
+        guard let mapKeys = map[key], mapKeys.count > 0 else {
+            throw PaddleMobileError.paramError(message: key + " not found in \(map) or maped values is empty")
+        }
+        guard let variant = from[mapKeys[0]] else {
+            throw PaddleMobileError.paramError(message: mapKeys[0] + " not found in scope")
+        }
+        
+        guard let v = variant as? VarType else {
+            throw PaddleMobileError.paramError(message: " type error")
+            
+        }
+        return v
     }
-    guard let variant = from[mapKeys[0]] else {
-      throw PaddleMobileError.paramError(message: mapKeys[0] + " not found in scope")
+    
+    static func outputVariances<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
+        do {
+            let tensorVariances: VarType = try getFirstTensor(key: "Variances", map: outputs, from: from)
+            return tensorVariances
+        } catch let error {
+            throw error
+        }
     }
     
-    guard let v = variant as? VarType else {
-      throw PaddleMobileError.paramError(message: " type error")
-
+    static func paramInputAlpha<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+        do {
+            let alphaTensor: VarType = try getFirstTensor(key: "Alpha", map: inputs, from: from)
+            return alphaTensor
+        } catch let error {
+            throw error
+        }
+    }
+    
+    
+    static func inputImage<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+        do {
+            let tensorImage: VarType = try getFirstTensor(key: "Image", map: inputs, from: from)
+            return tensorImage
+        } catch let error {
+            throw error
+        }
+    }
+    
+    static func inputX<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+        do {
+            let tensorX: VarType = try getFirstTensor(key: "X", map: inputs, from: from)
+            return tensorX
+        } catch let error {
+            throw error
+        }
+    }
+    
+    static func outputBoxes<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
+        do {
+            let tensorBox: VarType = try getFirstTensor(key: "Boxes", map: outputs, from: from)
+            return tensorBox
+        } catch let error {
+            throw error
+        }
+    }
+    
+    static func input<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+        do {
+            let tensorInput: VarType = try getFirstTensor(key: "Input", map: inputs, from: from)
+            return tensorInput
+        } catch let error {
+            throw error
+        }
+    }
+    
+    static func output<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
+        do {
+            let tensorOutput: VarType = try getFirstTensor(key: "Output", map: outputs, from: from)
+            return tensorOutput
+        } catch let error {
+            throw error
+        }
+    }
+    static func outputY<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
+        do {
+            let tensorOutputY: VarType = try getFirstTensor(key: "Y", map: outputs, from: from)
+            return tensorOutputY
+        } catch let error {
+            throw error
+        }
+    }
+    static func inputY<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+        do {
+            let tensorY: VarType = try getFirstTensor(key: "Y", map: inputs, from: from)
+            return tensorY
+        } catch let error {
+            throw error
+        }
+    }
+    
+    static func outputOut<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
+        do {
+            let out: VarType = try getFirstTensor(key: "Out", map: outputs, from: from)
+            return out
+        } catch let error {
+            throw error
+        }
+    }
+    static func inputFilter<VarType: Variant>(paraInputs: [String : [String]], from: Scope) throws -> VarType {
+        do {
+            let tensorFilter: VarType = try getFirstTensor(key: "Filter", map: paraInputs, from: from)
+            return tensorFilter
+        } catch let error {
+            throw error
+        }
+    }
+    
+    static func inputBiase<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+        do {
+            let tensorBias: VarType = try getFirstTensor(key: "Bias", map: inputs, from: from)
+            return tensorBias
+        } catch let error {
+            throw error
+        }
+    }
+    
+    static func inputMean<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+        do {
+            let tensorMean: VarType = try getFirstTensor(key: "Mean", map: inputs, from: from)
+            return tensorMean
+        } catch let error {
+            throw error
+        }
+    }
+    
+    static func inputScale<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+        do {
+            let tensorScale: VarType = try getFirstTensor(key: "Scale", map: inputs, from: from)
+            return tensorScale
+        } catch let error {
+            throw error
+        }
+    }
+    
+    static func inputVariance<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+        do {
+            let tensorVariance: VarType = try getFirstTensor(key: "Variance", map: inputs, from: from)
+            return tensorVariance
+        } catch let error {
+            throw error
+        }
+    }
+    
+    static func getAttr<T>(key: String, attrs: [String : Attr]) throws -> T{
+        guard let attr = attrs[key] else {
+            throw PaddleMobileError.paramError(message: "attr \(key) can't found in: \(attrs)" )
+        }
+        
+        guard let tAttr = attr as? T else {
+            throw PaddleMobileError.paramError(message: "key: \(key) attr: \(attr) type error" )
+        }
+        return tAttr
     }
-    return v
-  }
-  
-  static func outputVariances<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorVariances: VarType = try getFirstTensor(key: "Variances", map: outputs, from: from)
-      return tensorVariances
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func paramInputAlpha<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let alphaTensor: VarType = try getFirstTensor(key: "Alpha", map: inputs, from: from)
-      return alphaTensor
-    } catch let error {
-      throw error
-    }
-  }
-  
-  
-  static func inputImage<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorImage: VarType = try getFirstTensor(key: "Image", map: inputs, from: from)
-      return tensorImage
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func inputX<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorX: VarType = try getFirstTensor(key: "X", map: inputs, from: from)
-      return tensorX
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func outputBoxes<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorBox: VarType = try getFirstTensor(key: "Boxes", map: outputs, from: from)
-      return tensorBox
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func input<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorInput: VarType = try getFirstTensor(key: "Input", map: inputs, from: from)
-      return tensorInput
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func output<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorOutput: VarType = try getFirstTensor(key: "Output", map: outputs, from: from)
-      return tensorOutput
-    } catch let error {
-      throw error
-    }
-  }
-  static func outputY<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorOutputY: VarType = try getFirstTensor(key: "Y", map: outputs, from: from)
-      return tensorOutputY
-    } catch let error {
-      throw error
-    }
-  }
-  static func inputY<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorY: VarType = try getFirstTensor(key: "Y", map: inputs, from: from)
-      return tensorY
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func outputOut<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let out: VarType = try getFirstTensor(key: "Out", map: outputs, from: from)
-      return out
-    } catch let error {
-      throw error
-    }
-  }
-  static func inputFilter<VarType: Variant>(paraInputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorFilter: VarType = try getFirstTensor(key: "Filter", map: paraInputs, from: from)
-      return tensorFilter
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func inputBiase<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorBias: VarType = try getFirstTensor(key: "Bias", map: inputs, from: from)
-      return tensorBias
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func inputMean<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorMean: VarType = try getFirstTensor(key: "Mean", map: inputs, from: from)
-      return tensorMean
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func inputScale<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorScale: VarType = try getFirstTensor(key: "Scale", map: inputs, from: from)
-      return tensorScale
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func inputVariance<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
-    do {
-      let tensorVariance: VarType = try getFirstTensor(key: "Variance", map: inputs, from: from)
-      return tensorVariance
-    } catch let error {
-      throw error
-    }
-  }
-  
-  static func getAttr<T>(key: String, attrs: [String : Attr]) throws -> T{
-    guard let attr = attrs[key] else {
-      throw PaddleMobileError.paramError(message: "attr \(key) can't found in: \(attrs)" )
-    }
-    
-    guard let tAttr = attr as? T else {
-      throw PaddleMobileError.paramError(message: "key: \(key) attr: \(attr) type error" )
-    }
-    return tAttr
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift
index 532d1b661d4cb0e9823e09a9fc82d13af4f40f76..df7a765d2d25220e441f795f8b681163dbd26fc8 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift
@@ -16,129 +16,129 @@ import Metal
 import Foundation
 
 protocol Fusion {
-  static func fusionNode() -> Node
-  static func change() -> [String : [(from: String, to: String)]]
-  static func fusionType() -> String
-  static func needCheck() -> [(Int, String)]
+    static func fusionNode() -> Node
+    static func change() -> [String : [(from: String, to: String)]]
+    static func fusionType() -> String
+    static func needCheck() -> [(Int, String)]
 }
 extension Fusion {
-  static func needCheck() -> [(Int, String)] {
-    return []
-  }
+    static func needCheck() -> [(Int, String)] {
+        return []
+    }
 }
 
 protocol Runable {
-  func run(device: MTLDevice, buffer: MTLCommandBuffer) throws
-  func runImpl(device: MTLDevice,buffer: MTLCommandBuffer) throws
-  func delogOutput()
-  func inputVariant() -> [String : [MTLBuffer]]
-  func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer)
+    func run(device: MTLDevice, buffer: MTLCommandBuffer) throws
+    func runImpl(device: MTLDevice,buffer: MTLCommandBuffer) throws
+    func delogOutput()
+    func inputVariant() -> [String : [MTLBuffer]]
+    func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer)
 }
 
 extension Runable where Self: OperatorProtocol{
-  func run(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try runImpl(device: device, buffer: buffer)
-    } catch let error {
-      throw error
+    func run(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try runImpl(device: device, buffer: buffer)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func inputVariant() -> [String : [MTLBuffer]] {
+        //    return [:]
+        fatalError(" op \(type) need implement inputVariant")
+    }
+    
+    func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) {
+        fatalError(" need implement ")
     }
-  }
-  
-  func inputVariant() -> [String : [MTLBuffer]] {
-//    return [:]
-    fatalError(" op \(type) need implement inputVariant")
-  }
-  
-  func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) {
-    fatalError(" need implement ")
-  }
-  
-  func delogOutput() {
     
-    print(type + ": has no implementation" )
-  }
+    func delogOutput() {
+        
+        print(type + ": has no implementation" )
+    }
 }
 
 public class InitContext {
-  /// metal 代码加载方式
-  var metalLoadMode: MetalLoadMode = .LoadMetalInDefaultLib
-  /// 当 metalLoadMode 为 LoadMetalInCustomMetalLib 时， metal library 路径不能为空
-  var metalLibPath: String? = nil
-  init() {
-    metalLoadMode = .LoadMetalInDefaultLib
-    metalLibPath = nil
-  }
+    /// How the Metal code is loaded.
+    var metalLoadMode: MetalLoadMode = .LoadMetalInDefaultLib
+    /// When metalLoadMode is LoadMetalInCustomMetalLib, the metal library path must not be empty.
+    var metalLibPath: String? = nil
+    init() {
+        metalLoadMode = .LoadMetalInDefaultLib
+        metalLibPath = nil
+    }
 }
 
 protocol Creator where Self: OperatorProtocol{
-  associatedtype OpType: OperatorProtocol & Runable & InferShaperable
-  static func creat(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> OpType
+    associatedtype OpType: OperatorProtocol & Runable & InferShaperable
+    static func creat(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> OpType
 }
 
 extension Creator where Self: OperatorProtocol {
-  static func creat(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> OpType {
-    do {
-      return try OpType.provide(device:device, opDesc: opDesc, inScope: inScope, initContext: initContext)
-    } catch let error {
-      throw error
+    static func creat(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> OpType {
+        do {
+            return try OpType.provide(device:device, opDesc: opDesc, inScope: inScope, initContext: initContext)
+        } catch let error {
+            throw error
+        }
     }
-  }
 }
 
 protocol InferShaperable {
-  func inferShape()
+    func inferShape()
 }
 
 protocol OperatorProtocol {
-  associatedtype ParamType
-  associatedtype KerType:  Computable where Self.KerType.ParamType == ParamType
-  var type: String { get }
-  var scope: Scope { get }
-  var inputs: [String : [String]] { get }
-  var paraInputs: [String : [String]] { get set }
-  var outpus: [String : [String]] { get }
-  var attrs: [String : Attr] { get }
-  var para: ParamType { get }
-  var kernel: KerType { get }
-  init(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws
+    associatedtype ParamType
+    associatedtype KerType:  Computable where Self.KerType.ParamType == ParamType
+    var type: String { get }
+    var scope: Scope { get }
+    var inputs: [String : [String]] { get }
+    var paraInputs: [String : [String]] { get set }
+    var outpus: [String : [String]] { get }
+    var attrs: [String : Attr] { get }
+    var para: ParamType { get }
+    var kernel: KerType { get }
+    init(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws
 }
 
 extension OperatorProtocol {
-  static func provide(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> Self {
-    do {
-      return try Self.init(device: device, opDesc: opDesc, inScope: inScope, initContext: initContext)
-    } catch let error {
-      throw error
+    static func provide(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> Self {
+        do {
+            return try Self.init(device: device, opDesc: opDesc, inScope: inScope, initContext: initContext)
+        } catch let error {
+            throw error
+        }
     }
-  }
 }
 
 class Operator <KernelType:  Computable , ParameterType>: OperatorProtocol where KernelType.ParamType == ParameterType {
-  required init(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws {
-    type = opDesc.type
-    scope = inScope
-    inputs = opDesc.inputs
-    outpus = opDesc.outputs
-    attrs =  opDesc.attrs
-    paraInputs = opDesc.paraInputs
-    do {
-      para = try ParamType.init(opDesc:opDesc, inScope: inScope)
-    } catch let error {
-      throw error
+    required init(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws {
+        type = opDesc.type
+        scope = inScope
+        inputs = opDesc.inputs
+        outpus = opDesc.outputs
+        attrs =  opDesc.attrs
+        paraInputs = opDesc.paraInputs
+        do {
+            para = try ParamType.init(opDesc:opDesc, inScope: inScope)
+        } catch let error {
+            throw error
+        }
+        kernel = KernelType.init(device: device, param: para, initContext: initContext)
     }
-    kernel = KernelType.init(device: device, param: para, initContext: initContext)
-  }
-  
-  typealias ParamType = ParameterType
-  typealias KerType = KernelType
-  let type: String
-  let inputs: [String : [String]]
-  var paraInputs: [String : [String]]
-  let outpus: [String : [String]]
-  let attrs: [String : Attr]
-  let para: ParamType
-  let scope: Scope
-  var kernel: KerType
+    
+    typealias ParamType = ParameterType
+    typealias KerType = KernelType
+    let type: String
+    let inputs: [String : [String]]
+    var paraInputs: [String : [String]]
+    let outpus: [String : [String]]
+    let attrs: [String : Attr]
+    let para: ParamType
+    let scope: Scope
+    var kernel: KerType
 }
 
 // op infos
@@ -202,4 +202,4 @@ let opInfos = [gConvType                    : (inputs: ["Input"], outputs: ["Out
                gConvAddAddPreluType         : (inputs: ["Input"], outputs: ["Out"]),
                gElementwiseAddPreluType     : (inputs: ["X"], outputs: ["Out"]),
                gFusionConvAddType           : (inputs: ["Input"], outputs: ["Out"])
-              ]
+]
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/BatchNormOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/BatchNormOp.swift
index a877620416cb1b12be1ac1ef2a86f198fe75fc60..904e04c468e8b85c2b5d4c01ea828c10444c4692 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/BatchNormOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/BatchNormOp.swift
@@ -16,52 +16,52 @@ import Foundation
 import Metal
 
 class BatchNormParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try BatchNormParam.inputX(inputs: opDesc.inputs, from: inScope)
-      if input.transpose != [0, 2, 3, 1] {
-        fatalError("batch norm only accepts NHWC")
-      }
-      output = try BatchNormParam.outputY(outputs: opDesc.outputs, from: inScope)
-      bias = try BatchNormParam.getFirstTensor(key: "Bias", map: opDesc.paraInputs, from: inScope)
-      mean = try BatchNormParam.getFirstTensor(key: "Mean", map: opDesc.paraInputs, from: inScope)
-      scale = try BatchNormParam.getFirstTensor(key: "Scale", map: opDesc.paraInputs, from: inScope)
-      variance = try BatchNormParam.getFirstTensor(key: "Variance", map: opDesc.paraInputs, from: inScope)
-      epsilon = try BatchNormParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
-      momentum = try BatchNormParam.getAttr(key: "momentum", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            input = try BatchNormParam.inputX(inputs: opDesc.inputs, from: inScope)
+            if input.transpose != [0, 2, 3, 1] {
+                fatalError("batch norm only accepts NHWC")
+            }
+            output = try BatchNormParam.outputY(outputs: opDesc.outputs, from: inScope)
+            bias = try BatchNormParam.getFirstTensor(key: "Bias", map: opDesc.paraInputs, from: inScope)
+            mean = try BatchNormParam.getFirstTensor(key: "Mean", map: opDesc.paraInputs, from: inScope)
+            scale = try BatchNormParam.getFirstTensor(key: "Scale", map: opDesc.paraInputs, from: inScope)
+            variance = try BatchNormParam.getFirstTensor(key: "Variance", map: opDesc.paraInputs, from: inScope)
+            epsilon = try BatchNormParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
+            momentum = try BatchNormParam.getAttr(key: "momentum", attrs: opDesc.attrs)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  let input: Texture
-  var output: Texture
-  let bias: Tensor<P>
-  let mean: Tensor<P>
-  let scale: Tensor<P>
-  let variance: Tensor<P>
-  let epsilon: Float
-  let momentum: Float
+    let input: Texture
+    var output: Texture
+    let bias: Tensor<P>
+    let mean: Tensor<P>
+    let scale: Tensor<P>
+    let variance: Tensor<P>
+    let epsilon: Float
+    let momentum: Float
 }
 
 class BatchNormOp<P: PrecisionType>: Operator<BatchNormKernel<P>, BatchNormParam<P>>, Runable, Creator, InferShaperable{
-  typealias OpType = BatchNormOp<P>
-
-  func inferShape() {
-    para.output.dim = para.input.dim
-  }
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    typealias OpType = BatchNormOp<P>
+    
+    func inferShape() {
+        para.output.dim = para.input.dim
+    }
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        let device = para.output.metalTexture!.device
+        let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+        print(outputArray.strideArray())
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    let device = para.output.metalTexture!.device
-    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-    print(outputArray.strideArray())
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/BilinearInterpOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/BilinearInterpOp.swift
index a19dd1039073812b024a55c60bfad8c3c1387e71..e44a49d9004ea9260ff32881b810fbd294d00d29 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/BilinearInterpOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/BilinearInterpOp.swift
@@ -16,50 +16,50 @@ import Foundation
 import Metal
 
 class BilinearInterpParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try BilinearInterpParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try BilinearInterpParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      out_h = try BilinearInterpParam.getAttr(key: "out_h", attrs: opDesc.attrs)
-      out_w = try BilinearInterpParam.getAttr(key: "out_w", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            input = try BilinearInterpParam.inputX(inputs: opDesc.inputs, from: inScope)
+            output = try BilinearInterpParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            out_h = try BilinearInterpParam.getAttr(key: "out_h", attrs: opDesc.attrs)
+            out_w = try BilinearInterpParam.getAttr(key: "out_w", attrs: opDesc.attrs)
+        } catch let error {
+            throw error
+        }
+        if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) {
+            fatalError()
+        }
     }
-    if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) {
-      fatalError()
-    }
-  }
-  let input: Texture
-  var output: Texture
-  let out_h: Int
-  let out_w: Int
+    let input: Texture
+    var output: Texture
+    let out_h: Int
+    let out_w: Int
 }
 
 class BilinearInterpOp<P: PrecisionType>: Operator<BilinearInterpKernel<P>, BilinearInterpParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = BilinearInterpOp<P>
-
-  func inferShape() {
-    //        para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = BilinearInterpOp<P>
+    
+    func inferShape() {
+        //        para.output.dim = para.input.dim
+    }
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        let device = para.output.metalTexture!.device
+        let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+        //    print(outputArray)
+        print(outputArray.strideArray())
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    let device = para.output.metalTexture!.device
-    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-//    print(outputArray)
-    print(outputArray.strideArray())
-  }
-  
+    
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/BoxcoderOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/BoxcoderOp.swift
index 4679885ab6e5c946d9b335f8b59f8537e37ea967..442d1af9ea4629e56979ad970dfbaf85f497d3b6 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/BoxcoderOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/BoxcoderOp.swift
@@ -15,69 +15,69 @@
 import Foundation
 
 class BoxcoderParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      priorBox = try BoxcoderParam.getFirstTensor(key: "PriorBox", map: opDesc.inputs, from: inScope)
-      priorBoxVar = try BoxcoderParam.getFirstTensor(key: "PriorBoxVar", map: opDesc.inputs, from: inScope)
-      targetBox = try BoxcoderParam.getFirstTensor(key: "TargetBox", map: opDesc.inputs, from: inScope)
-      output = try BoxcoderParam.getFirstTensor(key: "OutputBox", map: opDesc.outputs, from: inScope)
-      codeType = try BoxcoderParam.getAttr(key: "code_type", attrs: opDesc.attrs)
-      boxNormalized = try BoxcoderParam.getAttr(key: "box_normalized", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            priorBox = try BoxcoderParam.getFirstTensor(key: "PriorBox", map: opDesc.inputs, from: inScope)
+            priorBoxVar = try BoxcoderParam.getFirstTensor(key: "PriorBoxVar", map: opDesc.inputs, from: inScope)
+            targetBox = try BoxcoderParam.getFirstTensor(key: "TargetBox", map: opDesc.inputs, from: inScope)
+            output = try BoxcoderParam.getFirstTensor(key: "OutputBox", map: opDesc.outputs, from: inScope)
+            codeType = try BoxcoderParam.getAttr(key: "code_type", attrs: opDesc.attrs)
+            boxNormalized = try BoxcoderParam.getAttr(key: "box_normalized", attrs: opDesc.attrs)
+        } catch let error {
+            throw error
+        }
+        assert(priorBox.tensorDim.cout() == 2)
+        assert(priorBoxVar.tensorDim.cout() == 2)
+        assert(targetBox.tensorDim.cout() == 3)
+        assert(output.tensorDim.cout() == 3)
+        assert(priorBox.transpose == [0, 1, 2, 3])
+        assert(priorBoxVar.transpose == [0, 1, 2, 3])
+        assert(targetBox.transpose == [0, 1, 2, 3])
+        assert(codeType == "decode_center_size") // encode_center_size is not implemented
+        assert((targetBox.tensorDim.cout() == 3) && (targetBox.tensorDim[0] == 1)) // N must be 1 (only handle batch size = 1)
     }
-    assert(priorBox.tensorDim.cout() == 2)
-    assert(priorBoxVar.tensorDim.cout() == 2)
-    assert(targetBox.tensorDim.cout() == 3)
-    assert(output.tensorDim.cout() == 3)
-    assert(priorBox.transpose == [0, 1, 2, 3])
-    assert(priorBoxVar.transpose == [0, 1, 2, 3])
-    assert(targetBox.transpose == [0, 1, 2, 3])
-    assert(codeType == "decode_center_size") // encode_center_size is not implemented
-    assert((targetBox.tensorDim.cout() == 3) && (targetBox.tensorDim[0] == 1)) // N must be 1 (only handle batch size = 1)
-  }
-  let priorBox: Texture
-  let priorBoxVar: Texture
-  let targetBox: Texture
-  var output: Texture
-  let codeType: String
-  let boxNormalized: Bool
+    let priorBox: Texture
+    let priorBoxVar: Texture
+    let targetBox: Texture
+    var output: Texture
+    let codeType: String
+    let boxNormalized: Bool
 }
 
 class BoxcoderOp<P: PrecisionType>: Operator<BoxcoderKernel<P>, BoxcoderParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = BoxcoderOp<P>
-
-  func inferShape() {
-    //        para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = BoxcoderOp<P>
+    
+    func inferShape() {
+        //        para.output.dim = para.input.dim
+    }
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        let device = para.output.metalTexture!.device
+        let pbv : [Float32] = device.texture2tensor(texture: para.priorBoxVar.metalTexture!, dim: para.priorBoxVar.tensorDim.dims, transpose: para.priorBoxVar.transpose)
+        let pb : [Float32] = device.texture2tensor(texture: para.priorBox.metalTexture!, dim: para.priorBox.tensorDim.dims, transpose: para.priorBox.transpose)
+        let tb : [Float32] = device.texture2tensor(texture: para.targetBox.metalTexture!, dim: para.targetBox.tensorDim.dims, transpose: para.targetBox.transpose)
+        let out : [Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+        print(" prior box var ")
+        print(pbv.strideArray())
+        print(" target box ")
+        print(tb.strideArray())
+        print(" prior box ")
+        print(pb.strideArray())
+        print(" output ")
+        print(out.strideArray())
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    let device = para.output.metalTexture!.device
-    let pbv : [Float32] = device.texture2tensor(texture: para.priorBoxVar.metalTexture!, dim: para.priorBoxVar.tensorDim.dims, transpose: para.priorBoxVar.transpose)
-    let pb : [Float32] = device.texture2tensor(texture: para.priorBox.metalTexture!, dim: para.priorBox.tensorDim.dims, transpose: para.priorBox.transpose)
-    let tb : [Float32] = device.texture2tensor(texture: para.targetBox.metalTexture!, dim: para.targetBox.tensorDim.dims, transpose: para.targetBox.transpose)
-    let out : [Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-    print(" prior box var ")
-    print(pbv.strideArray())
-    print(" target box ")
-    print(tb.strideArray())
-    print(" prior box ")
-    print(pb.strideArray())
-    print(" output ")
-    print(out.strideArray())
-  }
-  
+    
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift
index c2c22d55af6fc33ca69cbc028f149d54285459e7..a8034c681fd8677138fec6fc818be73f3fd8411e 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift
@@ -15,62 +15,62 @@
 import Foundation
 
 class ConcatParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      guard let xlist = opDesc.inputs["X"] else {
-        fatalError()
-      }
-      for x in xlist {
-        guard let variant = inScope[x], let v = variant as? Texture else {
-          fatalError()
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            guard let xlist = opDesc.inputs["X"] else {
+                fatalError()
+            }
+            for x in xlist {
+                guard let variant = inScope[x], let v = variant as? Texture else {
+                    fatalError()
+                }
+                if transpose.count == 0 {
+                    transpose = v.transpose
+                }
+                if v.transpose != transpose {
+                    fatalError()
+                }
+                
+                input.append(v)
+            }
+            axis = try ConcatParam.getAttr(key: "axis", attrs: opDesc.attrs)
+            output = try ConcatParam.outputOut(outputs: opDesc.outputs, from: inScope)
+        } catch let error {
+            throw error
         }
-        if transpose.count == 0 {
-          transpose = v.transpose
-        }
-        if v.transpose != transpose {
-          fatalError()
-        }
-       
-        input.append(v)
-      }
-      axis = try ConcatParam.getAttr(key: "axis", attrs: opDesc.attrs)
-      output = try ConcatParam.outputOut(outputs: opDesc.outputs, from: inScope)
-    } catch let error {
-      throw error
     }
-  }
-  var input: [Texture] = []
-  var output: Texture
-  var transpose: [Int] = []
-  let axis: Int
+    var input: [Texture] = []
+    var output: Texture
+    var transpose: [Int] = []
+    let axis: Int
 }
 
 class ConcatOp<P: PrecisionType>: Operator<ConcatKernel<P>, ConcatParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = ConcatOp<P>
-
-  func inferShape() {
-    //        let dim = para.input.reduce([0, 0]) {[$0[0] + $1.dim[0], $1.dim[1]]}
-    //        para.output.dim = Dim.init(inDim: dim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = ConcatOp<P>
+    
+    func inferShape() {
+        //        let dim = para.input.reduce([0, 0]) {[$0[0] + $1.dim[0], $1.dim[1]]}
+        //        para.output.dim = Dim.init(inDim: dim)
+    }
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        
+        let device = para.output.metalTexture!.device
+        let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+        print(outputArray.strideArray())
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
     
-    let device = para.output.metalTexture!.device
-    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-    print(outputArray.strideArray())
-  }
-  
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddAddPreluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddAddPreluOp.swift
index 552d72f436bf6de89f52bae186f72a0a778b1f4c..e7865045e5485721c5c4137fa8a8be0e7b8680e4 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddAddPreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddAddPreluOp.swift
@@ -16,94 +16,94 @@ import Foundation
 import Metal
 
 class ConvAddAddPreluParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      filter = try ConvAddAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
-      input = try ConvAddAddPreluParam.input(inputs: opDesc.inputs, from: inScope)
-      output = try ConvAddAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      stride = try ConvAddAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs)
-      paddings = try ConvAddAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
-      dilations = try ConvAddAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
-      groups = try ConvAddAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs)
-      alpha = try ConvAddAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
-      mode = try ConvAddAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
-      y = try ConvAddAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            filter = try ConvAddAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
+            input = try ConvAddAddPreluParam.input(inputs: opDesc.inputs, from: inScope)
+            output = try ConvAddAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            stride = try ConvAddAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs)
+            paddings = try ConvAddAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
+            dilations = try ConvAddAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
+            groups = try ConvAddAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs)
+            alpha = try ConvAddAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
+            mode = try ConvAddAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
+            y = try ConvAddAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  let input: Texture
-  let y: Tensor<P>
-  let filter: Tensor<P>
-  let mode: String
-  let alpha: Tensor<P>
-  var output: Texture
-  let stride: [Int32]
-  let paddings: [Int32]
-  let dilations: [Int32]
-  let groups: Int
+    
+    let input: Texture
+    let y: Tensor<P>
+    let filter: Tensor<P>
+    let mode: String
+    let alpha: Tensor<P>
+    var output: Texture
+    let stride: [Int32]
+    let paddings: [Int32]
+    let dilations: [Int32]
+    let groups: Int
 }
 
 class ConvAddAddPreluOp<P: PrecisionType>: Operator<ConvAddAddPreluKernel<P>, ConvAddAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
-  typealias OpType = ConvAddAddPreluOp<P>
-  
-  static func fusionNode() -> Node {
-    let beginNode = Node.init(inType: gConvType)
-    _ = beginNode
-      --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType)
-    return beginNode
-  }
-  
-  static func change() -> [String : [(from: String, to: String)]] {
-    return [:]
-  }
-  
-  static func fusionType() -> String {
-    return gConvAddAddPreluType
-  }
-  
-  static func needCheck() -> [(Int, String)] {
-    return [(2, "Y"), (2, "X")]
-  }
-  
-  
-  
-  func inferShape() {
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
+    typealias OpType = ConvAddAddPreluOp<P>
+    
+    static func fusionNode() -> Node {
+        let beginNode = Node.init(inType: gConvType)
+        _ = beginNode
+            --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType)
+        return beginNode
+    }
+    
+    static func change() -> [String : [(from: String, to: String)]] {
+        return [:]
+    }
+    
+    static func fusionType() -> String {
+        return gConvAddAddPreluType
+    }
+    
+    static func needCheck() -> [(Int, String)] {
+        return [(2, "Y"), (2, "X")]
+    }
+    
+    
+    
+    func inferShape() {
+        let inDims = para.input.dim
+        let filterDim = para.filter.dim
+        let strides = para.stride
+        let paddings = para.paddings
+        let dilations = para.dilations
+        
+        var outDim = [inDims[0]]
+        for i in 0..<strides.count {
+            let dilation: Int = Int(dilations[i])
+            let filterSize: Int = filterDim[i + 1]
+            let inputSize: Int = inDims[i + 1]
+            let padding: Int = Int(paddings[i])
+            let stride: Int = Int(strides[i])
+            let dKernel = dilation * (filterSize - 1) + 1
+            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+            outDim.append(outputSize)
+        }
+        outDim.append(filterDim[0])
+        para.output.dim = Dim.init(inDim: outDim)
+    }
     
-    var outDim = [inDims[0]]
-    for i in 0..<strides.count {
-      let dilation: Int = Int(dilations[i])
-      let filterSize: Int = filterDim[i + 1]
-      let inputSize: Int = inDims[i + 1]
-      let padding: Int = Int(paddings[i])
-      let stride: Int = Int(strides[i])
-      let dKernel = dilation * (filterSize - 1) + 1
-      let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
-      outDim.append(outputSize)
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
     }
-    outDim.append(filterDim[0])
-    para.output.dim = Dim.init(inDim: outDim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
     }
-  }
-  
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
-  }
-  
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddBatchNormReluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddBatchNormReluOp.swift
index 6aacd4208e0a46ba6c88f9e2073c6ef3d4753952..311967c22cf0918e7cde986e72834a244516f85c 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddBatchNormReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddBatchNormReluOp.swift
@@ -16,115 +16,115 @@ import Foundation
 
 
 class ConvAddBatchNormReluParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      
-      filter = try ConvAddBatchNormReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
-      input = try ConvAddBatchNormReluParam.input(inputs: opDesc.inputs, from: inScope)
-      output = try ConvAddBatchNormReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      stride = try ConvAddBatchNormReluParam.getAttr(key: "strides", attrs: opDesc.attrs)
-      paddings = try ConvAddBatchNormReluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
-      dilations = try ConvAddBatchNormReluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
-      epsilon = try ConvAddBatchNormReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
-      
-      groups = try ConvAddBatchNormReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
-      variance = try ConvAddBatchNormReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope)
-      bias = try ConvAddBatchNormReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope)
-      
-      scale = try ConvAddBatchNormReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
-      mean = try ConvAddBatchNormReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
-      y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            
+            filter = try ConvAddBatchNormReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
+            input = try ConvAddBatchNormReluParam.input(inputs: opDesc.inputs, from: inScope)
+            output = try ConvAddBatchNormReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            stride = try ConvAddBatchNormReluParam.getAttr(key: "strides", attrs: opDesc.attrs)
+            paddings = try ConvAddBatchNormReluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
+            dilations = try ConvAddBatchNormReluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
+            epsilon = try ConvAddBatchNormReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
+            
+            groups = try ConvAddBatchNormReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
+            variance = try ConvAddBatchNormReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope)
+            bias = try ConvAddBatchNormReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope)
+            
+            scale = try ConvAddBatchNormReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
+            mean = try ConvAddBatchNormReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
+            y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  let input: Texture
-  
-  let variance: Tensor<P>
-  let bias: Tensor<P>
-  let mean: Tensor<P>
-  let scale: Tensor<P>
-  let y: Tensor<P>
-  let filter: Tensor<P>
-  let epsilon: Float32
-  var newScale: MTLBuffer?
-  var newBiase: MTLBuffer?
-  
-  var output: Texture
-  let stride: [Int32]
-  let paddings: [Int32]
-  let dilations: [Int32]
-  let groups: Int
+    
+    let input: Texture
+    
+    let variance: Tensor<P>
+    let bias: Tensor<P>
+    let mean: Tensor<P>
+    let scale: Tensor<P>
+    let y: Tensor<P>
+    let filter: Tensor<P>
+    let epsilon: Float32
+    var newScale: MTLBuffer?
+    var newBiase: MTLBuffer?
+    
+    var output: Texture
+    let stride: [Int32]
+    let paddings: [Int32]
+    let dilations: [Int32]
+    let groups: Int
 }
 
 class ConvAddBatchNormReluOp<P: PrecisionType>: Operator<ConvAddBatchNormReluKernel<P>, ConvAddBatchNormReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
-  
-  typealias OpType = ConvAddBatchNormReluOp<P>
-  
-  func inferShape() {
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
     
-    var outDim = [inDims[0]]
-    for i in 0..<strides.count {
-      let dilation: Int = Int(dilations[i])
-      let filterSize: Int = filterDim[i + 1]
-      let inputSize: Int = inDims[i + 1]
-      let padding: Int = Int(paddings[i])
-      let stride: Int = Int(strides[i])
-      let dKernel = dilation * (filterSize - 1) + 1
-      let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
-      outDim.append(outputSize)
+    typealias OpType = ConvAddBatchNormReluOp<P>
+    
+    func inferShape() {
+        let inDims = para.input.dim
+        let filterDim = para.filter.dim
+        let strides = para.stride
+        let paddings = para.paddings
+        let dilations = para.dilations
+        
+        var outDim = [inDims[0]]
+        for i in 0..<strides.count {
+            let dilation: Int = Int(dilations[i])
+            let filterSize: Int = filterDim[i + 1]
+            let inputSize: Int = inDims[i + 1]
+            let padding: Int = Int(paddings[i])
+            let stride: Int = Int(strides[i])
+            let dKernel = dilation * (filterSize - 1) + 1
+            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+            outDim.append(outputSize)
+        }
+        outDim.append(filterDim[0])
+        para.output.dim = Dim.init(inDim: outDim)
+    }
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
     }
-    outDim.append(filterDim[0])
-    para.output.dim = Dim.init(inDim: outDim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    static func fusionNode() -> Node {
+        let beginNode = Node.init(inType: gConvType)
+        _ = beginNode
+            --> Node.init(inType: gElementwiseAddType)
+            --> Node.init(inType: gBatchNormType)
+            --> Node.init(inType: gReluType)
+        return beginNode
     }
-  }
-  
-  static func fusionNode() -> Node {
-    let beginNode = Node.init(inType: gConvType)
-    _ = beginNode
-      --> Node.init(inType: gElementwiseAddType)
-      --> Node.init(inType: gBatchNormType)
-      --> Node.init(inType: gReluType)
-    return beginNode
-  }
-  
-  static func change() -> [String : [(from: String, to: String)]] {
-    return [:]
-  }
-  
-  static func fusionType() -> String {
-    return gConvAddBatchNormReluType
-  }
-  
-  func delogOutput() {
-    print(" conv add batchnorm relu output ")
-    print(para.output.toTensor().strideArray())
-    //        let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false)
-    //        para.filter.logDataPointer(header: "filter data pointer: ")
-    //        print("filter: \(para.filter)")
     
-    //        print("biase: \(para.y)")
-    //        print("padding: \(para.paddings)")
-    //        print("stride: \(para.stride)")
+    static func change() -> [String : [(from: String, to: String)]] {
+        return [:]
+    }
     
-    //        let _: P? = para.y.buffer?.logDesc(header: " biase: ", stridable: false)
-    //        let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false)
-    //        let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false)
+    static func fusionType() -> String {
+        return gConvAddBatchNormReluType
+    }
     
-    //        let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false)
-  }
+    func delogOutput() {
+        print(" conv add batchnorm relu output ")
+        print(para.output.toTensor().strideArray())
+        //        let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false)
+        //        para.filter.logDataPointer(header: "filter data pointer: ")
+        //        print("filter: \(para.filter)")
+        
+        //        print("biase: \(para.y)")
+        //        print("padding: \(para.paddings)")
+        //        print("stride: \(para.stride)")
+        
+        //        let _: P? = para.y.buffer?.logDesc(header: " biase: ", stridable: false)
+        //        let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false)
+        //        let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false)
+        
+        //        let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false)
+    }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift
index 923c2c210ddba99dcebec77ae91299cd28ed638e..7b9958a0666b619de46ee15935233a45e81585dd 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift
@@ -15,103 +15,103 @@
 import Foundation
 
 class ConvAddParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      filter = try ConvAddParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
-      input = try ConvAddParam.input(inputs: opDesc.inputs, from: inScope)
-      output = try ConvAddParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      stride = try ConvAddParam.getAttr(key: "strides", attrs: opDesc.attrs)
-      paddings = try ConvAddParam.getAttr(key: "paddings", attrs: opDesc.attrs)
-      dilations = try ConvAddParam.getAttr(key: "dilations", attrs: opDesc.attrs)
-      groups = try ConvAddParam.getAttr(key: "groups", attrs: opDesc.attrs)
-      
-      y = try ConvAddParam.inputY(inputs: opDesc.paraInputs, from: inScope)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            filter = try ConvAddParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
+            input = try ConvAddParam.input(inputs: opDesc.inputs, from: inScope)
+            output = try ConvAddParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            stride = try ConvAddParam.getAttr(key: "strides", attrs: opDesc.attrs)
+            paddings = try ConvAddParam.getAttr(key: "paddings", attrs: opDesc.attrs)
+            dilations = try ConvAddParam.getAttr(key: "dilations", attrs: opDesc.attrs)
+            groups = try ConvAddParam.getAttr(key: "groups", attrs: opDesc.attrs)
+            
+            y = try ConvAddParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  let input: Texture
-  let y: Tensor<P>
-  let filter: Tensor<P>
-  
-  var output: Texture
-  let stride: [Int32]
-  let paddings: [Int32]
-  let dilations: [Int32]
-  let groups: Int
+    
+    let input: Texture
+    let y: Tensor<P>
+    let filter: Tensor<P>
+    
+    var output: Texture
+    let stride: [Int32]
+    let paddings: [Int32]
+    let dilations: [Int32]
+    let groups: Int
 }
 
 class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>, Runable, Creator, InferShaperable, Fusion{
-  typealias OpType = ConvAddOp<P>
-
-  static func fusionNode() -> Node {
-    let beginNode = Node.init(inType: gConvType)
-    _ = beginNode
-      --> Node.init(inType: gElementwiseAddType)
-    return beginNode
-  }
-  
-  static func change() -> [String : [(from: String, to: String)]] {
-    return [:]
-  }
-  
-  static func fusionType() -> String {
-    return gConvAddType
-  }
-  
-  func inferShape() {
+    typealias OpType = ConvAddOp<P>
+    
+    static func fusionNode() -> Node {
+        let beginNode = Node.init(inType: gConvType)
+        _ = beginNode
+            --> Node.init(inType: gElementwiseAddType)
+        return beginNode
+    }
     
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
+    static func change() -> [String : [(from: String, to: String)]] {
+        return [:]
+    }
     
-    var outDim = [inDims[0]]
-    for i in 0..<strides.count {
-      let dilation: Int = Int(dilations[i])
-      let filterSize: Int = filterDim[i + 1]
-      let inputSize: Int = inDims[i + 1]
-      let padding: Int = Int(paddings[i])
-      let stride: Int = Int(strides[i])
-      let dKernel = dilation * (filterSize - 1) + 1
-      let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
-      outDim.append(outputSize)
+    static func fusionType() -> String {
+        return gConvAddType
     }
-    outDim.append(filterDim[0])
-    para.output.dim = Dim.init(inDim: outDim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    func inferShape() {
+        
+        let inDims = para.input.dim
+        let filterDim = para.filter.dim
+        let strides = para.stride
+        let paddings = para.paddings
+        let dilations = para.dilations
+        
+        var outDim = [inDims[0]]
+        for i in 0..<strides.count {
+            let dilation: Int = Int(dilations[i])
+            let filterSize: Int = filterDim[i + 1]
+            let inputSize: Int = inDims[i + 1]
+            let padding: Int = Int(paddings[i])
+            let stride: Int = Int(strides[i])
+            let dKernel = dilation * (filterSize - 1) + 1
+            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+            outDim.append(outputSize)
+        }
+        outDim.append(filterDim[0])
+        para.output.dim = Dim.init(inDim: outDim)
     }
-  }
-  
-  func delogOutput() {
-//    print("op \(type): ")
-//    print(" padding: ")
-//    print(para.paddings)
-//    print("stride: ")
-//    print(para.stride)
-//    print("dilations: ")
-//    print(para.dilations)
-//    print(" para input dim: ")
-//    print(para.input.dim)
-//    print(" para filter dim: ")
-//    print(para.filter.dim)
-//    print(" para output dim: ")
-//    print(para.output.dim)
-//    print(" biase: ")
-//    let biase: [Float32] = para.y.buffer.array()
-//    print(biase)
     
-    print(" \(type) output: ")
-    print(para.output.metalTexture)
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
-  }
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        //    print("op \(type): ")
+        //    print(" padding: ")
+        //    print(para.paddings)
+        //    print("stride: ")
+        //    print(para.stride)
+        //    print("dilations: ")
+        //    print(para.dilations)
+        //    print(" para input dim: ")
+        //    print(para.input.dim)
+        //    print(" para filter dim: ")
+        //    print(para.filter.dim)
+        //    print(" para output dim: ")
+        //    print(para.output.dim)
+        //    print(" biase: ")
+        //    let biase: [Float32] = para.y.buffer.array()
+        //    print(biase)
+        
+        print(" \(type) output: ")
+        print(para.output.metalTexture)
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+    }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddPreluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddPreluOp.swift
index 1c0bbba8d9dba61560ce4be97369fbb406fe238a..dc3205622484d43b86bd0371043ead45df87534a 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddPreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddPreluOp.swift
@@ -15,87 +15,87 @@
 import Foundation
 
 class ConvAddPreluParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      filter = try ConvAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
-      input = try ConvAddPreluParam.input(inputs: opDesc.inputs, from: inScope)
-      output = try ConvAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      stride = try ConvAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs)
-      paddings = try ConvAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
-      dilations = try ConvAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
-      groups = try ConvAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs)
-      alpha = try ConvAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
-      mode = try ConvAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
-      y = try ConvAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            filter = try ConvAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
+            input = try ConvAddPreluParam.input(inputs: opDesc.inputs, from: inScope)
+            output = try ConvAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            stride = try ConvAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs)
+            paddings = try ConvAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
+            dilations = try ConvAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
+            groups = try ConvAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs)
+            alpha = try ConvAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
+            mode = try ConvAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
+            y = try ConvAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  let input: Texture
-  let y: Tensor<P>
-  let filter: Tensor<P>
-  let mode: String
-  let alpha: Tensor<P>
-  var output: Texture
-  let stride: [Int32]
-  let paddings: [Int32]
-  let dilations: [Int32]
-  let groups: Int
+    
+    let input: Texture
+    let y: Tensor<P>
+    let filter: Tensor<P>
+    let mode: String
+    let alpha: Tensor<P>
+    var output: Texture
+    let stride: [Int32]
+    let paddings: [Int32]
+    let dilations: [Int32]
+    let groups: Int
 }
 
 class ConvAddPreluOp<P: PrecisionType>: Operator<ConvAddPreluKernel<P>, ConvAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
-  typealias OpType = ConvAddPreluOp<P>
-  
-  static func fusionNode() -> Node {
-    let beginNode = Node.init(inType: gConvType)
-    _ = beginNode
-      --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType)
-    return beginNode
-  }
-  
-  static func change() -> [String : [(from: String, to: String)]] {
-    return [:]
-  }
-  
-  static func fusionType() -> String {
-    return gConvAddPreluType
-  }
-  
-  func inferShape() {
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
+    typealias OpType = ConvAddPreluOp<P>
+    
+    static func fusionNode() -> Node {
+        let beginNode = Node.init(inType: gConvType)
+        _ = beginNode
+            --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType)
+        return beginNode
+    }
     
-    var outDim = [inDims[0]]
-    for i in 0..<strides.count {
-      let dilation: Int = Int(dilations[i])
-      let filterSize: Int = filterDim[i + 1]
-      let inputSize: Int = inDims[i + 1]
-      let padding: Int = Int(paddings[i])
-      let stride: Int = Int(strides[i])
-      let dKernel = dilation * (filterSize - 1) + 1
-      let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
-      outDim.append(outputSize)
+    static func change() -> [String : [(from: String, to: String)]] {
+        return [:]
     }
-    outDim.append(filterDim[0])
-    para.output.dim = Dim.init(inDim: outDim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    static func fusionType() -> String {
+        return gConvAddPreluType
+    }
+    
+    func inferShape() {
+        let inDims = para.input.dim
+        let filterDim = para.filter.dim
+        let strides = para.stride
+        let paddings = para.paddings
+        let dilations = para.dilations
+        
+        var outDim = [inDims[0]]
+        for i in 0..<strides.count {
+            let dilation: Int = Int(dilations[i])
+            let filterSize: Int = filterDim[i + 1]
+            let inputSize: Int = inDims[i + 1]
+            let padding: Int = Int(paddings[i])
+            let stride: Int = Int(strides[i])
+            let dKernel = dilation * (filterSize - 1) + 1
+            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+            outDim.append(outputSize)
+        }
+        outDim.append(filterDim[0])
+        para.output.dim = Dim.init(inDim: outDim)
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
-  }
-  
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+    }
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvBNReluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvBNReluOp.swift
index 423e55e391ad9a110fb71af09a16373a322d3d5f..1a973c51ef32e85d699b39eae55ce7afbc06c720 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvBNReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvBNReluOp.swift
@@ -15,101 +15,101 @@
 import Foundation
 
 class ConvBNReluParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      filter = try ConvBNReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
-      input = try ConvBNReluParam.input(inputs: opDesc.inputs, from: inScope)
-      output = try ConvBNReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      stride = try ConvBNReluParam.getAttr(key: "strides", attrs: opDesc.attrs)
-      paddings = try ConvBNReluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
-      dilations = try ConvBNReluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
-      epsilon = try ConvBNReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
-      
-      groups = try ConvBNReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
-      variance = try ConvBNReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope)
-      bias = try ConvBNReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope)
-      scale = try ConvBNReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
-      mean = try ConvBNReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            filter = try ConvBNReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
+            input = try ConvBNReluParam.input(inputs: opDesc.inputs, from: inScope)
+            output = try ConvBNReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            stride = try ConvBNReluParam.getAttr(key: "strides", attrs: opDesc.attrs)
+            paddings = try ConvBNReluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
+            dilations = try ConvBNReluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
+            epsilon = try ConvBNReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
+            
+            groups = try ConvBNReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
+            variance = try ConvBNReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope)
+            bias = try ConvBNReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope)
+            scale = try ConvBNReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
+            mean = try ConvBNReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  let input: Texture
-  let variance: Tensor<P>
-  let bias: Tensor<P>
-  let mean: Tensor<P>
-  let scale: Tensor<P>
-  let filter: Tensor<P>
-  let epsilon: Float32
-  var newScale: MTLBuffer?
-  var newBiase: MTLBuffer?
-  
-  var output: Texture
-  let stride: [Int32]
-  let paddings: [Int32]
-  let dilations: [Int32]
-  let groups: Int
+    
+    let input: Texture
+    let variance: Tensor<P>
+    let bias: Tensor<P>
+    let mean: Tensor<P>
+    let scale: Tensor<P>
+    let filter: Tensor<P>
+    let epsilon: Float32
+    var newScale: MTLBuffer?
+    var newBiase: MTLBuffer?
+    
+    var output: Texture
+    let stride: [Int32]
+    let paddings: [Int32]
+    let dilations: [Int32]
+    let groups: Int
 }
 
 class ConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
-  typealias OpType = ConvBNReluOp<P>
-  
-  func inputs() -> [Variant] {
-    return [para.input, para.variance, para.bias, para.mean, para.scale, para.filter]
-  }
-  
-  
-  func inferShape() {
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
+    typealias OpType = ConvBNReluOp<P>
+    
+    func inputs() -> [Variant] {
+        return [para.input, para.variance, para.bias, para.mean, para.scale, para.filter]
+    }
     
-    var outDim = [inDims[0]]
-    for i in 0..<strides.count {
-      let dilation: Int = Int(dilations[i])
-      let filterSize: Int = filterDim[i + 1]
-      let inputSize: Int = inDims[i + 1]
-      let padding: Int = Int(paddings[i])
-      let stride: Int = Int(strides[i])
-      let dKernel = dilation * (filterSize - 1) + 1
-      let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
-      outDim.append(outputSize)
+    
+    func inferShape() {
+        let inDims = para.input.dim
+        let filterDim = para.filter.dim
+        let strides = para.stride
+        let paddings = para.paddings
+        let dilations = para.dilations
+        
+        var outDim = [inDims[0]]
+        for i in 0..<strides.count {
+            let dilation: Int = Int(dilations[i])
+            let filterSize: Int = filterDim[i + 1]
+            let inputSize: Int = inDims[i + 1]
+            let padding: Int = Int(paddings[i])
+            let stride: Int = Int(strides[i])
+            let dKernel = dilation * (filterSize - 1) + 1
+            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+            outDim.append(outputSize)
+        }
+        outDim.append(filterDim[0])
+        para.output.dim = Dim.init(inDim: outDim)
+    }
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
     }
-    outDim.append(filterDim[0])
-    para.output.dim = Dim.init(inDim: outDim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    static func fusionNode() -> Node {
+        let beginNode = Node.init(inType: gConvType)
+        _ = beginNode
+            --> Node.init(inType: gBatchNormType)
+            --> Node.init(inType: gReluType)
+        return beginNode
     }
-  }
-  
-  static func fusionNode() -> Node {
-    let beginNode = Node.init(inType: gConvType)
-    _ = beginNode
-      --> Node.init(inType: gBatchNormType)
-      --> Node.init(inType: gReluType)
-    return beginNode
-  }
-  
-  static func change() -> [String : [(from: String, to: String)]] {
-    return [:]
-  }
-  
-  static func fusionType() -> String {
-    return gConvBnReluType
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
-  }
-  
+    
+    static func change() -> [String : [(from: String, to: String)]] {
+        return [:]
+    }
+    
+    static func fusionType() -> String {
+        return gConvBnReluType
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
+    }
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift
index c66813b166fefd8fe5f139c94d73cf55ff83d682..2d402ae4316d50cf47e648ac9485bac75f47161e 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift
@@ -15,67 +15,67 @@
 import Foundation
 
 class ConvParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      filter = try ConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
-      input = try ConvParam.input(inputs: opDesc.inputs, from: inScope)
-      output = try ConvParam.output(outputs: opDesc.outputs, from: inScope)
-      stride = try ConvParam.getAttr(key: "strides", attrs: opDesc.attrs)
-      paddings = try ConvParam.getAttr(key: "paddings", attrs: opDesc.attrs)
-      dilations = try ConvParam.getAttr(key: "dilations", attrs: opDesc.attrs)
-      groups = try ConvParam.getAttr(key: "groups", attrs: opDesc.attrs)
-      
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            filter = try ConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
+            input = try ConvParam.input(inputs: opDesc.inputs, from: inScope)
+            output = try ConvParam.output(outputs: opDesc.outputs, from: inScope)
+            stride = try ConvParam.getAttr(key: "strides", attrs: opDesc.attrs)
+            paddings = try ConvParam.getAttr(key: "paddings", attrs: opDesc.attrs)
+            dilations = try ConvParam.getAttr(key: "dilations", attrs: opDesc.attrs)
+            groups = try ConvParam.getAttr(key: "groups", attrs: opDesc.attrs)
+            
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  let input: Texture
-  let filter: Tensor<P>
-  var output: Texture
-  let stride: [Int32]
-  let paddings: [Int32]
-  let dilations: [Int32]
-  let groups: Int
+    
+    let input: Texture
+    let filter: Tensor<P>
+    var output: Texture
+    let stride: [Int32]
+    let paddings: [Int32]
+    let dilations: [Int32]
+    let groups: Int
 }
 
 class ConvOp<P: PrecisionType>: Operator<ConvKernel<P>, ConvParam<P>>, Runable, Creator, InferShaperable {
-  typealias OpType = ConvOp<P>
-
-  func inferShape() {
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
+    typealias OpType = ConvOp<P>
     
-    var outDim = [inDims[0]]
-    for i in 0..<strides.count {
-      let dilation: Int = Int(dilations[i])
-      let filterSize: Int = filterDim[i + 1]
-      let inputSize: Int = inDims[i + 1]
-      let padding: Int = Int(paddings[i])
-      let stride: Int = Int(strides[i])
-      let dKernel = dilation * (filterSize - 1) + 1
-      let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
-      outDim.append(outputSize)
+    func inferShape() {
+        let inDims = para.input.dim
+        let filterDim = para.filter.dim
+        let strides = para.stride
+        let paddings = para.paddings
+        let dilations = para.dilations
+        
+        var outDim = [inDims[0]]
+        for i in 0..<strides.count {
+            let dilation: Int = Int(dilations[i])
+            let filterSize: Int = filterDim[i + 1]
+            let inputSize: Int = inDims[i + 1]
+            let padding: Int = Int(paddings[i])
+            let stride: Int = Int(strides[i])
+            let dKernel = dilation * (filterSize - 1) + 1
+            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+            outDim.append(outputSize)
+        }
+        outDim.append(filterDim[0])
+        para.output.dim = Dim.init(inDim: outDim)
     }
-    outDim.append(filterDim[0])
-    para.output.dim = Dim.init(inDim: outDim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        print("conv output : ")
+        print(para.output.toTensor().strideArray())
+        //        let _: Float16? = para.output.metalTexture.logDesc()
     }
-  }
-  
-  func delogOutput() {
-    print("conv output : ")
-    print(para.output.toTensor().strideArray())
-    //        let _: Float16? = para.output.metalTexture.logDesc()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvTransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvTransposeOp.swift
index c035f403a62875da14df291bad01766731caf380..8322263e7c6e558d2e05493dc619c6ebe29fe2e7 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvTransposeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvTransposeOp.swift
@@ -15,44 +15,44 @@
 import Foundation
 
 class ConvTransposeParam<P: PrecisionType>: ConvParam<P> {
-  //typealias ParamPrecisionType = P
+    //typealias ParamPrecisionType = P
     required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      try super.init(opDesc: opDesc, inScope: inScope)
-    } catch let error {
-      throw error
+        do {
+            try super.init(opDesc: opDesc, inScope: inScope)
+        } catch let error {
+            throw error
+        }
     }
-  }
 }
 
 class ConvTransposeOp<P: PrecisionType>: Operator<ConvTransposeKernel<P>, ConvTransposeParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = ConvTransposeOp<P>
-  
-  func inferShape() {
-    // para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = ConvTransposeOp<P>
+    
+    func inferShape() {
+        // para.output.dim = para.input.dim
     }
-  }
-  
-  func delogOutput() {
-  
-    print(" \(type) output: ")
-    let padToFourDim = para.output.padToFourDim
-    if para.output.transpose == [0, 1, 2, 3] {
-      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
-      print(outputArray.strideArray())
-    } else if para.output.transpose == [0, 2, 3, 1] {
-      let output = para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]))
-      print(output.strideArray())
-    } else {
-      print(" not implement")
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        
+        print(" \(type) output: ")
+        let padToFourDim = para.output.padToFourDim
+        if para.output.transpose == [0, 1, 2, 3] {
+            let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
+            print(outputArray.strideArray())
+        } else if para.output.transpose == [0, 2, 3, 1] {
+            let output = para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]))
+            print(output.strideArray())
+        } else {
+            print(" not implement")
+        }
     }
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/DepthwiseConvOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/DepthwiseConvOp.swift
index 96818a9fd8bd14a69b249200eb7c32c222096318..4686501fddda1cfe7e15680af6b3984381d842cb 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/DepthwiseConvOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/DepthwiseConvOp.swift
@@ -15,41 +15,41 @@
 import Foundation
 
 class DepthConvOp<P: PrecisionType>: Operator<ConvKernel<P>, ConvParam<P>>, Runable, Creator, InferShaperable {
-
-  typealias OpType = DepthConvOp<P>
-  
-  func inferShape() {
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
     
-    var outDim = [inDims[0]]
-    for i in 0..<strides.count {
-      let dilation: Int = Int(dilations[i])
-      let filterSize: Int = filterDim[i + 1]
-      let inputSize: Int = inDims[i + 1]
-      let padding: Int = Int(paddings[i])
-      let stride: Int = Int(strides[i])
-      let dKernel = dilation * (filterSize - 1) + 1
-      let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
-      outDim.append(outputSize)
+    typealias OpType = DepthConvOp<P>
+    
+    func inferShape() {
+        let inDims = para.input.dim
+        let filterDim = para.filter.dim
+        let strides = para.stride
+        let paddings = para.paddings
+        let dilations = para.dilations
+        
+        var outDim = [inDims[0]]
+        for i in 0..<strides.count {
+            let dilation: Int = Int(dilations[i])
+            let filterSize: Int = filterDim[i + 1]
+            let inputSize: Int = inDims[i + 1]
+            let padding: Int = Int(paddings[i])
+            let stride: Int = Int(strides[i])
+            let dKernel = dilation * (filterSize - 1) + 1
+            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+            outDim.append(outputSize)
+        }
+        outDim.append(filterDim[0])
+        para.output.dim = Dim.init(inDim: outDim)
     }
-    outDim.append(filterDim[0])
-    para.output.dim = Dim.init(inDim: outDim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/DwConvBNReluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/DwConvBNReluOp.swift
index 8575cfd88c7ddea2f007cad21507b4620c87d3e2..ef3bc21316126c6902eb5086c2aaf79743aac4f7 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/DwConvBNReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/DwConvBNReluOp.swift
@@ -15,56 +15,56 @@
 import Foundation
 
 class DwConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
-  typealias OpType = ConvBNReluOp<P>
-  
-  func inferShape() {
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
+    typealias OpType = ConvBNReluOp<P>
     
-    var outDim = [inDims[0]]
-    for i in 0..<strides.count {
-      let dilation: Int = Int(dilations[i])
-      let filterSize: Int = filterDim[i + 1]
-      let inputSize: Int = inDims[i + 1]
-      let padding: Int = Int(paddings[i])
-      let stride: Int = Int(strides[i])
-      let dKernel = dilation * (filterSize - 1) + 1
-      let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
-      outDim.append(outputSize)
+    func inferShape() {
+        let inDims = para.input.dim
+        let filterDim = para.filter.dim
+        let strides = para.stride
+        let paddings = para.paddings
+        let dilations = para.dilations
+        
+        var outDim = [inDims[0]]
+        for i in 0..<strides.count {
+            let dilation: Int = Int(dilations[i])
+            let filterSize: Int = filterDim[i + 1]
+            let inputSize: Int = inDims[i + 1]
+            let padding: Int = Int(paddings[i])
+            let stride: Int = Int(strides[i])
+            let dKernel = dilation * (filterSize - 1) + 1
+            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+            outDim.append(outputSize)
+        }
+        outDim.append(filterDim[0])
+        para.output.dim = Dim.init(inDim: outDim)
     }
-    outDim.append(filterDim[0])
-    para.output.dim = Dim.init(inDim: outDim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    static func fusionNode() -> Node {
+        let beginNode = Node.init(inType: gDepthConvType)
+        _ = beginNode
+            --> Node.init(inType: gBatchNormType)
+            --> Node.init(inType: gReluType)
+        return beginNode
+    }
+    
+    static func change() -> [String : [(from: String, to: String)]] {
+        return [:]
+    }
+    
+    static func fusionType() -> String {
+        return gDwConvBnReluType
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
     }
-  }
-  
-  static func fusionNode() -> Node {
-    let beginNode = Node.init(inType: gDepthConvType)
-    _ = beginNode
-      --> Node.init(inType: gBatchNormType)
-      --> Node.init(inType: gReluType)
-    return beginNode
-  }
-  
-  static func change() -> [String : [(from: String, to: String)]] {
-    return [:]
-  }
-  
-  static func fusionType() -> String {
-    return gDwConvBnReluType
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddOp.swift
index 5fa69d4f44e48603dec9213be78d08b11b433edd..cd5307b5840b2cb756d132f3e60c21b7fe86f484 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddOp.swift
@@ -16,80 +16,80 @@ import Foundation
 import Metal
 
 class ElementwiseAddParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      inputX = try ElementwiseAddParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try ElementwiseAddParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      axis = try ElementwiseAddParam.getAttr(key: "axis", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            inputX = try ElementwiseAddParam.inputX(inputs: opDesc.inputs, from: inScope)
+            output = try ElementwiseAddParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            axis = try ElementwiseAddParam.getAttr(key: "axis", attrs: opDesc.attrs)
+        } catch let error {
+            throw error
+        }
+        do {
+            inputY = try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+        } catch _ {
+            let tensorY: Tensor<P> = try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+            let device = inputX.metalTexture!.device
+            inputY = Texture.init(device: device, inDim: tensorY.dim)
+            let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel()))
+            inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision)
+        }
+        
+        //    required init(device: MTLDevice, param: ElementwiseAddParam<P>) {
+        //      param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
+        //      if computePrecision == .Float32 {
+        //        super.init(device: device, inFunctionName: "elementwise_add")
+        //      } else if computePrecision == .Float16 {
+        //        super.init(device: device, inFunctionName: "elementwise_add_half")
+        //      } else {
+        //        fatalError()
+        //      }
+        //    }
+        
+        var offset = axis
+        if axis == -1 {
+            offset = inputX.tensorDim.cout() - inputY.tensorDim.cout()
+        }
+        for i in 0..<(inputY.tensorDim.cout()) {
+            assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i])
+        }
     }
-    do {
-      inputY = try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope)
-    } catch _ {
-      let tensorY: Tensor<P> = try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope)
-      let device = inputX.metalTexture!.device
-      inputY = Texture.init(device: device, inDim: tensorY.dim)
-      let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel()))
-      inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision)
-    }
-    
-//    required init(device: MTLDevice, param: ElementwiseAddParam<P>) {
-//      param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
-//      if computePrecision == .Float32 {
-//        super.init(device: device, inFunctionName: "elementwise_add")
-//      } else if computePrecision == .Float16 {
-//        super.init(device: device, inFunctionName: "elementwise_add_half")
-//      } else {
-//        fatalError()
-//      }
-//    }
     
-    var offset = axis
-    if axis == -1 {
-      offset = inputX.tensorDim.cout() - inputY.tensorDim.cout()
-    }
-    for i in 0..<(inputY.tensorDim.cout()) {
-      assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i])
-    }
-  }
-  
-  var inputX: Texture
-  var inputY: Texture
-  var output: Texture
-  var axis: Int
+    var inputX: Texture
+    var inputY: Texture
+    var output: Texture
+    var axis: Int
 }
 
 class ElementwiseAddOp<P: PrecisionType>: Operator<ElementwiseAddKernel<P>, ElementwiseAddParam<P>>, Runable, Creator, InferShaperable{
-  typealias OpType = ElementwiseAddOp<P>
-  
-  func inferShape() {
-//    para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    typealias OpType = ElementwiseAddOp<P>
+    
+    func inferShape() {
+        //    para.output.dim = para.input.dim
+    }
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output)
     
-    let padToFourDim = para.output.padToFourDim
-    if para.output.transpose == [0, 1, 2, 3] {
-      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
-      print(outputArray.strideArray())
-    } else if para.output.transpose == [0, 2, 3, 1] {
-      print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
-    } else {
-      print(" not implement")
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output)
+        
+        let padToFourDim = para.output.padToFourDim
+        if para.output.transpose == [0, 1, 2, 3] {
+            let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
+            print(outputArray.strideArray())
+        } else if para.output.transpose == [0, 2, 3, 1] {
+            print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+        } else {
+            print(" not implement")
+        }
     }
-  }
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddPreluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddPreluOp.swift
index 6a49d7bfa2fe4f060eedc84d47a8c1f8d64ee4d0..bd853f6c0f5b209da21f9965b30726d0703f11df 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddPreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddPreluOp.swift
@@ -16,101 +16,101 @@ import Foundation
 import Metal
 
 class ElementwiseAddPreluParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      alpha = try ElementwiseAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
-      mode = try ElementwiseAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
-      inputX = try ElementwiseAddPreluParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try ElementwiseAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      axis = try ElementwiseAddPreluParam.getAttr(key: "axis", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            alpha = try ElementwiseAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
+            mode = try ElementwiseAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
+            inputX = try ElementwiseAddPreluParam.inputX(inputs: opDesc.inputs, from: inScope)
+            output = try ElementwiseAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            axis = try ElementwiseAddPreluParam.getAttr(key: "axis", attrs: opDesc.attrs)
+        } catch let error {
+            throw error
+        }
+        do {
+            inputY = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+        } catch _ {
+            let tensorY: Tensor<P> = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+            let device = inputX.metalTexture!.device
+            inputY = Texture.init(device: device, inDim: tensorY.dim)
+            let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel()))
+            inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision)
+        }
+        
+        //    required init(device: MTLDevice, param: ElementwiseAddParam<P>) {
+        //      param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
+        //      if computePrecision == .Float32 {
+        //        super.init(device: device, inFunctionName: "elementwise_add")
+        //      } else if computePrecision == .Float16 {
+        //        super.init(device: device, inFunctionName: "elementwise_add_half")
+        //      } else {
+        //        fatalError()
+        //      }
+        //    }
+        
+        var offset = axis
+        if axis == -1 {
+            offset = inputX.tensorDim.cout() - inputY.tensorDim.cout()
+        }
+        for i in 0..<(inputY.tensorDim.cout()) {
+            assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i])
+        }
     }
-    do {
-      inputY = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
-    } catch _ {
-      let tensorY: Tensor<P> = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
-      let device = inputX.metalTexture!.device
-      inputY = Texture.init(device: device, inDim: tensorY.dim)
-      let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel()))
-      inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision)
+    
+    let mode: String
+    let alpha: Tensor<P>
+    var inputX: Texture
+    var inputY: Texture
+    var output: Texture
+    var axis: Int
+}
+
+class ElementwiseAddPreluOp<P: PrecisionType>: Operator<ElementwiseAddPreluKernel<P>, ElementwiseAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
+    static func fusionNode() -> Node {
+        let beginNode = Node.init(inType: gElementwiseAddType)
+        _ = beginNode
+            --> Node.init(inType: gPreluType)
+        return beginNode
     }
     
-    //    required init(device: MTLDevice, param: ElementwiseAddParam<P>) {
-    //      param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
-    //      if computePrecision == .Float32 {
-    //        super.init(device: device, inFunctionName: "elementwise_add")
-    //      } else if computePrecision == .Float16 {
-    //        super.init(device: device, inFunctionName: "elementwise_add_half")
-    //      } else {
-    //        fatalError()
-    //      }
-    //    }
+    static func change() -> [String : [(from: String, to: String)]] {
+        return [:]
+    }
     
-    var offset = axis
-    if axis == -1 {
-      offset = inputX.tensorDim.cout() - inputY.tensorDim.cout()
+    static func fusionType() -> String {
+        return gElementwiseAddPreluType
     }
-    for i in 0..<(inputY.tensorDim.cout()) {
-      assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i])
+    
+    typealias OpType = ElementwiseAddPreluOp<P>
+    
+    func inferShape() {
+        //    para.output.dim = para.input.dim
     }
-  }
-  
-  let mode: String
-  let alpha: Tensor<P>
-  var inputX: Texture
-  var inputY: Texture
-  var output: Texture
-  var axis: Int
-}
-
-class ElementwiseAddPreluOp<P: PrecisionType>: Operator<ElementwiseAddPreluKernel<P>, ElementwiseAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
-  static func fusionNode() -> Node {
-    let beginNode = Node.init(inType: gElementwiseAddType)
-    _ = beginNode
-      --> Node.init(inType: gPreluType)
-    return beginNode
-  }
-  
-  static func change() -> [String : [(from: String, to: String)]] {
-    return [:]
-  }
-  
-  static func fusionType() -> String {
-    return gElementwiseAddPreluType
-  }
-  
-  typealias OpType = ElementwiseAddPreluOp<P>
-  
-  func inferShape() {
-    //    para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output)
     
-    let padToFourDim = para.output.padToFourDim
-    if para.output.transpose == [0, 1, 2, 3] {
-      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
-      print(outputArray.strideArray())
-    } else if para.output.transpose == [0, 2, 3, 1] {
-      print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
-    } else {
-      print(" not implement")
+    
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output)
+        
+        let padToFourDim = para.output.padToFourDim
+        if para.output.transpose == [0, 1, 2, 3] {
+            let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
+            print(outputArray.strideArray())
+        } else if para.output.transpose == [0, 2, 3, 1] {
+            print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+        } else {
+            print(" not implement")
+        }
     }
-  }
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift
index 46defcb58332a02cbc365a087708e792a66c6e5c..bab3d8dce740291138a4f328907d521bf132e2bf 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift
@@ -17,54 +17,54 @@ import MetalKit
 import CoreMedia
 
 class FeedParam<P: PrecisionType>: OpParam{
-  var output: Texture
-  var input: InputTexture {
-    return scope.input() as! InputTexture
-  }
-  let scope: Scope
-  
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    scope = inScope
-    do {
-      output = try FeedParam.outputOut(outputs: opDesc.outputs, from: inScope)
-    } catch let error {
-      throw error
+    var output: Texture
+    var input: InputTexture {
+        return scope.input() as! InputTexture
     }
-  }
-  
-  //typealias ParamPrecisionType = P
+    let scope: Scope
+    
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        scope = inScope
+        do {
+            output = try FeedParam.outputOut(outputs: opDesc.outputs, from: inScope)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    //typealias ParamPrecisionType = P
 }
 
 class FeedOp<P: PrecisionType>: Operator<Texture2DTo2DArrayKernel<P>, FeedParam<P>>, Runable, Creator, InferShaperable {
-  typealias OpType = FeedOp<P>
-
-  func inferShape() {
-    //        print("feed  input: \(para.input.expectDim)")
-    print("feed output: \(para.output.dim)")
-    //        para.output.dim =
-    //        para.output.dim = para.input.expectDim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    typealias OpType = FeedOp<P>
+    
+    func inferShape() {
+        //        print("feed  input: \(para.input.expectDim)")
+        print("feed output: \(para.output.dim)")
+        //        para.output.dim =
+        //        para.output.dim = para.input.expectDim
+    }
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+        
+        //        let resizeKernel = ResizeKernel<P>.init(device: device)
+        //        let resizeParam = ResizeParam.init(input: para.input.mtlTexture, output: para.output.metalTexture, expectDim: para.input.expectDim)
+        //        do {
+        //            try resizeKernel.compute(commandBuffer: buffer, param: resizeParam)
+        //        } catch let error {
+        //            throw error
+        //        }
     }
     
-    //        let resizeKernel = ResizeKernel<P>.init(device: device)
-    //        let resizeParam = ResizeParam.init(input: para.input.mtlTexture, output: para.output.metalTexture, expectDim: para.input.expectDim)
-    //        do {
-    //            try resizeKernel.compute(commandBuffer: buffer, param: resizeParam)
-    //        } catch let error {
-    //            throw error
-    //        }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output.metalTexture)
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[3], h: para.output.padToFourDim[2], w: para.output.padToFourDim[1])).strideArray())
-  }
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output.metalTexture)
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[3], h: para.output.padToFourDim[2], w: para.output.padToFourDim[1])).strideArray())
+    }
 }
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift
index a5d04a4b03a182a4e843a31628bd2892de597093..671c2f33faa29620cf423166b0f6f71849a66e67 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift
@@ -16,43 +16,43 @@ import Foundation
 import Metal
 
 class FetchParam<P: PrecisionType>: OpParam{
-  var output: FetchHolder
-  let input: Texture
-  let scope: Scope
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    scope = inScope
-    do {
-      input = try FetchParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = FetchHolder.init(inPaddedCapacity: input.elementCount(), inDim: input.tensorDim)
-      scope.setOutput(output: output)
-    } catch let error {
-      throw error
+    var output: FetchHolder
+    let input: Texture
+    let scope: Scope
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        scope = inScope
+        do {
+            input = try FetchParam.inputX(inputs: opDesc.inputs, from: inScope)
+            output = FetchHolder.init(inPaddedCapacity: input.elementCount(), inDim: input.tensorDim)
+            scope.setOutput(output: output)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  //typealias ParamPrecisionType = P
+    
+    //typealias ParamPrecisionType = P
 }
 
 class FetchOp<P: PrecisionType>: Operator< FetchKernel<P>, FetchParam<P>>, Runable, Creator, InferShaperable {
-  
-  typealias OpType = FetchOp<P>
-
-  func inferShape() {
-    print(para.input.dim)
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = FetchOp<P>
+    
+    func inferShape() {
+        print(para.input.dim)
+    }
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        print("fetch output: ")
+        let resArr = self.para.output.result.floatArr(count: self.para.output.capacity)
+        print(resArr.strideArray())
     }
-  }
-  
-  func delogOutput() {
-    print("fetch output: ")
-    let resArr = self.para.output.result.floatArr(count: self.para.output.capacity)
-    print(resArr.strideArray())
-  }
 }
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/FlattenOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/FlattenOp.swift
index 8500798adc75f9fac9e960857e9b0de319157c95..b98299085167f7cb5dbf090d7cc48f6e779a2294 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/FlattenOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/FlattenOp.swift
@@ -15,45 +15,45 @@
 import Foundation
 
 class FlattenParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try FlattenParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try FlattenParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      axis = try FlattenParam.getAttr(key: "axis", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            input = try FlattenParam.inputX(inputs: opDesc.inputs, from: inScope)
+            output = try FlattenParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            axis = try FlattenParam.getAttr(key: "axis", attrs: opDesc.attrs)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  let input: Texture
-  var output: Texture
-  let axis: Int
+    let input: Texture
+    var output: Texture
+    let axis: Int
 }
 
 
 class FlattenOp<P: PrecisionType>: Operator<FlattenKernel<P>, FlattenParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = FlattenOp<P>
-
-  func inferShape() {
-    //        para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = FlattenOp<P>
+    
+    func inferShape() {
+        //        para.output.dim = para.input.dim
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    let device = para.output.metalTexture!.device
-    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-    print(outputArray.strideArray())
-  }
-  
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        let device = para.output.metalTexture!.device
+        let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+        print(outputArray.strideArray())
+    }
+    
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift
index a7aaa9eddc49361ef718e5f6e627face49bc43b0..43ce7927ebf90c5ccc2ae1acf7df8f3f6b681863 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift
@@ -19,125 +19,125 @@ public protocol TestParam {
 }
 
 public protocol Testable {
-  associatedtype TestParamType: TestParam
-  func test(commandBuffer: MTLCommandBuffer, param: TestParamType)
-  init(device: MTLDevice, testParam: TestParamType, initContext: InitContext)
+    associatedtype TestParamType: TestParam
+    func test(commandBuffer: MTLCommandBuffer, param: TestParamType)
+    init(device: MTLDevice, testParam: TestParamType, initContext: InitContext)
 }
 
 
 protocol Computable {
-  associatedtype ParamType: OpParam
-  func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws
-  init(device: MTLDevice, param: ParamType, initContext: InitContext)
+    associatedtype ParamType: OpParam
+    func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws
+    init(device: MTLDevice, param: ParamType, initContext: InitContext)
 }
 
 protocol KernelProtocol {
-  var pipline: MTLComputePipelineState { get set }
-  var functionName: String { get set }
-  
+    var pipline: MTLComputePipelineState { get set }
+    var functionName: String { get set }
+    
 }
 
 @objc open class Kernel: NSObject{
-  let pipline: MTLComputePipelineState
-  let functionName: String
-  public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = false, initContext: InitContext) {
-    pipline = device.pipeLine(funcName: inFunctionName, metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath)
-    functionName = inFunctionName
-  }
+    let pipline: MTLComputePipelineState
+    let functionName: String
+    public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = false, initContext: InitContext) {
+        pipline = device.pipeLine(funcName: inFunctionName, metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath)
+        functionName = inFunctionName
+    }
 }
 
 @objc public class Shape: NSObject {
-  public let width: Int
-  public let height: Int
-  public let channel: Int
-  @objc public init(inWidth: Int, inHeight: Int, inChannel: Int){
-    width = inWidth
-    height = inHeight
-    channel = inChannel
-  }
+    public let width: Int
+    public let height: Int
+    public let channel: Int
+    @objc public init(inWidth: Int, inHeight: Int, inChannel: Int){
+        width = inWidth
+        height = inHeight
+        channel = inChannel
+    }
 }
 
 open class BufferToTextureKernel: Kernel {
-  public let outputTexture: MTLTexture
-  
-  public init(device: MTLDevice, outputDim: Shape, metalLoadMode: MetalLoadMode, metalLibPath: String?) {
-    let textureDesc = MTLTextureDescriptor.init()
-    textureDesc.textureType = .type2D
-    textureDesc.width = outputDim.width
-    textureDesc.height = outputDim.height
-    textureDesc.depth = (outputDim.channel + 3) / 4
+    public let outputTexture: MTLTexture
     
-    if GlobalConfig.shared.computePrecision == .Float16 {
-      textureDesc.pixelFormat = .rgba16Float
-    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      textureDesc.pixelFormat = .rgba32Float
-    } else {
-      fatalError()
+    public init(device: MTLDevice, outputDim: Shape, metalLoadMode: MetalLoadMode, metalLibPath: String?) {
+        let textureDesc = MTLTextureDescriptor.init()
+        textureDesc.textureType = .type2D
+        textureDesc.width = outputDim.width
+        textureDesc.height = outputDim.height
+        textureDesc.depth = (outputDim.channel + 3) / 4
+        
+        if GlobalConfig.shared.computePrecision == .Float16 {
+            textureDesc.pixelFormat = .rgba16Float
+        } else if GlobalConfig.shared.computePrecision == .Float32 {
+            textureDesc.pixelFormat = .rgba32Float
+        } else {
+            fatalError()
+        }
+        
+        textureDesc.usage = [.shaderRead, .shaderWrite]
+        textureDesc.storageMode = .shared
+        outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error "
+        let initContext = InitContext.init()
+        initContext.metalLibPath = metalLibPath
+        initContext.metalLoadMode = metalLoadMode
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "buffer_to_texture_kernel", initContext: initContext)
+        } else {
+            super.init(device: device, inFunctionName: "buffer_to_texture_kernel_half", initContext: initContext)
+        }
     }
     
-    textureDesc.usage = [.shaderRead, .shaderWrite]
-    textureDesc.storageMode = .shared
-    outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error "
-    let initContext = InitContext.init()
-    initContext.metalLibPath = metalLibPath
-    initContext.metalLoadMode = metalLoadMode
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "buffer_to_texture_kernel", initContext: initContext)
-    } else {
-      super.init(device: device, inFunctionName: "buffer_to_texture_kernel_half", initContext: initContext)
-    }
-  }
-  
-  public func compute(inputBuffer: MTLBuffer , commandBuffer: MTLCommandBuffer) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    public func compute(inputBuffer: MTLBuffer , commandBuffer: MTLCommandBuffer) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        
+        encoder.setBuffer(inputBuffer, offset: 0, index: 0)
+        encoder.setTexture(outputTexture, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: outputTexture)
+        encoder.endEncoding()
     }
     
-    encoder.setBuffer(inputBuffer, offset: 0, index: 0)
-    encoder.setTexture(outputTexture, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: outputTexture)
-    encoder.endEncoding()
-  }
-
 }
 
 @objc open class CusomKernel: Kernel {
-
-  public let outputTexture: MTLTexture
-  public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, metalLoadModel: MetalLoadMode, metalLibPath: String?) {
-    let textureDesc = MTLTextureDescriptor.init()
-    textureDesc.textureType = .type2D
-    textureDesc.width = outputDim.width
-    textureDesc.height = outputDim.height
-    textureDesc.depth = (outputDim.channel + 3) / 4
     
-    if GlobalConfig.shared.computePrecision == .Float16 {
-      textureDesc.pixelFormat = .rgba16Float
-    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      textureDesc.pixelFormat = .rgba32Float
-    } else {
-      fatalError()
+    public let outputTexture: MTLTexture
+    public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, metalLoadModel: MetalLoadMode, metalLibPath: String?) {
+        let textureDesc = MTLTextureDescriptor.init()
+        textureDesc.textureType = .type2D
+        textureDesc.width = outputDim.width
+        textureDesc.height = outputDim.height
+        textureDesc.depth = (outputDim.channel + 3) / 4
+        
+        if GlobalConfig.shared.computePrecision == .Float16 {
+            textureDesc.pixelFormat = .rgba16Float
+        } else if GlobalConfig.shared.computePrecision == .Float32 {
+            textureDesc.pixelFormat = .rgba32Float
+        } else {
+            fatalError()
+        }
+        
+        textureDesc.usage = [.shaderRead, .shaderWrite]
+        textureDesc.storageMode = .shared
+        outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error "
+        
+        let context = InitContext.init()
+        context.metalLoadMode = metalLoadModel
+        context.metalLibPath = metalLibPath
+        super.init(device: device, inFunctionName: inFunctionName, initContext: context)
     }
     
-    textureDesc.usage = [.shaderRead, .shaderWrite]
-    textureDesc.storageMode = .shared
-    outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error "
-    
-    let context = InitContext.init()
-    context.metalLoadMode = metalLoadModel
-    context.metalLibPath = metalLibPath
-    super.init(device: device, inFunctionName: inFunctionName, initContext: context)
-  }
-  
-  public func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    public func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        encoder.setTexture(inputTexuture, index: 0)
+        encoder.setTexture(outputTexture, index: 1)
+        encoder.dispatch(computePipline: pipline, outTexture: outputTexture)
+        encoder.endEncoding()
     }
-    encoder.setTexture(inputTexuture, index: 0)
-    encoder.setTexture(outputTexture, index: 1)
-    encoder.dispatch(computePipline: pipline, outTexture: outputTexture)
-    encoder.endEncoding()
-  }
-  
+    
 }
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormKernel.swift
index 9eeb2aff9cdb3d476be93b75b5e642430f531331..0e2005b024b94a28d4476624e507b10cc9d88f72 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormKernel.swift
@@ -15,39 +15,39 @@
 import Foundation
 
 class BatchNormKernel<P: PrecisionType>: Kernel, Computable {
-  required init(device: MTLDevice, param: BatchNormParam<P>, initContext: InitContext) {
-    let count = param.variance.dim.numel()
-    let varianceP = param.variance.data.pointer
-    let meanP = param.mean.data.pointer
-    let scaleP = param.scale.data.pointer
-    let biasP = param.bias.data.pointer
-    for i in 0..<count {
-      let invStd = P(1 / (Float32(varianceP[i]) + param.epsilon).squareRoot())
-      biasP[i] = biasP[i] - meanP[i] * invStd * scaleP[i]
-      scaleP[i] = invStd * scaleP[i]
+    required init(device: MTLDevice, param: BatchNormParam<P>, initContext: InitContext) {
+        let count = param.variance.dim.numel()
+        let varianceP = param.variance.data.pointer
+        let meanP = param.mean.data.pointer
+        let scaleP = param.scale.data.pointer
+        let biasP = param.bias.data.pointer
+        for i in 0..<count {
+            let invStd = P(1 / (Float32(varianceP[i]) + param.epsilon).squareRoot())
+            biasP[i] = biasP[i] - meanP[i] * invStd * scaleP[i]
+            scaleP[i] = invStd * scaleP[i]
+        }
+        
+        param.bias.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        param.scale.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "batchnorm", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "batchnorm_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
     }
-
-    param.bias.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.scale.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "batchnorm", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "batchnorm_half", initContext: initContext)
-    } else {
-      fatalError()
-    }
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encoder is nil")
+    
+    func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encoder is nil")
+        }
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setBuffer(param.scale.buffer, offset: 0, index: 0)
+        encoder.setBuffer(param.bias.buffer, offset: 0, index: 1)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBuffer(param.scale.buffer, offset: 0, index: 0)
-    encoder.setBuffer(param.bias.buffer, offset: 0, index: 1)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BilinearInterpKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BilinearInterpKernel.swift
index 0db2e98651df8a7d778b7b9754ba1d059a54f365..c8a65190851c7af154dc6c1f61cbde05db243632 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BilinearInterpKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BilinearInterpKernel.swift
@@ -15,41 +15,41 @@
 import Foundation
 
 struct BilinearInterpMetalParam {
-  var ratio_h: Float32
-  var ratio_w: Float32
+    var ratio_h: Float32
+    var ratio_w: Float32
 }
 
 class BilinearInterpKernel<P: PrecisionType>: Kernel, Computable{
-  func compute(commandBuffer: MTLCommandBuffer, param: BilinearInterpParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    func compute(commandBuffer: MTLCommandBuffer, param: BilinearInterpParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        var ratio_h: Float32 = 0
+        var ratio_w: Float32 = 0
+        if param.output.tensorDim.dims[2] > 1 {
+            ratio_h = Float32(param.input.tensorDim.dims[2]-1) / Float32(param.output.tensorDim.dims[2]-1)
+        }
+        if param.output.tensorDim.dims[3] > 1 {
+            ratio_w = Float32(param.input.tensorDim.dims[3]-1) / Float32(param.output.tensorDim.dims[3]-1)
+        }
+        var p = BilinearInterpMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w)
+        encoder.setBytes(&p, length: MemoryLayout<BilinearInterpMetalParam>.size, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
     
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    var ratio_h: Float32 = 0
-    var ratio_w: Float32 = 0
-    if param.output.tensorDim.dims[2] > 1 {
-      ratio_h = Float32(param.input.tensorDim.dims[2]-1) / Float32(param.output.tensorDim.dims[2]-1)
+    required init(device: MTLDevice, param: BilinearInterpParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "bilinear_interp_float", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "bilinear_interp_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
     }
-    if param.output.tensorDim.dims[3] > 1 {
-      ratio_w = Float32(param.input.tensorDim.dims[3]-1) / Float32(param.output.tensorDim.dims[3]-1)
-    }
-    var p = BilinearInterpMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w)
-    encoder.setBytes(&p, length: MemoryLayout<BilinearInterpMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-  
-  required init(device: MTLDevice, param: BilinearInterpParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "bilinear_interp_float", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "bilinear_interp_half", initContext: initContext)
-    } else {
-      fatalError()
-    }
-  }
-  
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BoxcoderKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BoxcoderKernel.swift
index 6e528a59650f017da0e50dff56f748e0255d6eee..8f295672c19991897d85ffb6622dfb840d0c0b50 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BoxcoderKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BoxcoderKernel.swift
@@ -18,29 +18,29 @@ struct BoxcoderMetalParam {
 }
 
 class BoxcoderKernel<P: PrecisionType>: Kernel, Computable{
-  func compute(commandBuffer: MTLCommandBuffer, param: BoxcoderParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    func compute(commandBuffer: MTLCommandBuffer, param: BoxcoderParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        encoder.setTexture(param.priorBox.metalTexture, index: 0)
+        encoder.setTexture(param.priorBoxVar.metalTexture, index: 1)
+        encoder.setTexture(param.targetBox.metalTexture, index: 2)
+        encoder.setTexture(param.output.metalTexture, index: 3)
+        var bmp = BoxcoderMetalParam.init()
+        encoder.setBytes(&bmp, length: MemoryLayout<BoxcoderMetalParam>.size, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
-    encoder.setTexture(param.priorBox.metalTexture, index: 0)
-    encoder.setTexture(param.priorBoxVar.metalTexture, index: 1)
-    encoder.setTexture(param.targetBox.metalTexture, index: 2)
-    encoder.setTexture(param.output.metalTexture, index: 3)
-    var bmp = BoxcoderMetalParam.init()
-    encoder.setBytes(&bmp, length: MemoryLayout<BoxcoderMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-  
-  required init(device: MTLDevice, param: BoxcoderParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 3, 1, 2], computePrecision: GlobalConfig.shared.computePrecision)
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "boxcoder_float", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "boxcoder_half", initContext: initContext)
-    } else {
-      fatalError()
+    
+    required init(device: MTLDevice, param: BoxcoderParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, inTranspose: [0, 3, 1, 2], computePrecision: GlobalConfig.shared.computePrecision)
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "boxcoder_float", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "boxcoder_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
     }
-  }
-  
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConcatKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConcatKernel.swift
index edb028968838f3ec8c11b45b649463da9f1d9ea1..195366c79668f7553848f969be03fbddb36d7905 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConcatKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConcatKernel.swift
@@ -16,133 +16,133 @@ import Foundation
 import Metal
 
 struct ConcatTestParam: TestParam {
-  var input: [MTLTexture]
-  var output: MTLTexture
-  var dims: [[Int]]
-  var axis: Int
-  var odim: [Int]
+    var input: [MTLTexture]
+    var output: MTLTexture
+    var dims: [[Int]]
+    var axis: Int
+    var odim: [Int]
 }
 
 struct ConcatMetalParam {
-  var odim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1)
-  var axis: Int32 = 0
-  var offset: Int32 = 0
-  var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
-  var vdim: (Int32, Int32, Int32, Int32, Int32, Int32) = (0, 0, 0, 0, 0, 0)
+    var odim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1)
+    var axis: Int32 = 0
+    var offset: Int32 = 0
+    var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
+    var vdim: (Int32, Int32, Int32, Int32, Int32, Int32) = (0, 0, 0, 0, 0, 0)
 }
 
 class ConcatKernel<P: PrecisionType>: Kernel, Computable{
-  var v = "normal"
-  var pm = ConcatMetalParam.init()
-  func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam<P>) throws {
-    
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
-    }
-    let num = param.input.count
-    for i in 0..<num {
-      encoder.setTexture(param.input[i].metalTexture, index: i)
-    }
-    encoder.setTexture(param.output.metalTexture, index: num)
-    if v == "normal" {
-      encoder.setTexture(param.output.metalTexture, index: num + 1)
-    }
-    encoder.setBytes(&pm, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-
-  required init(device: MTLDevice, param: ConcatParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    let orank = param.output.tensorDim.cout()
-    let num = param.input.count
-    assert(num <= 6)
-    var axis = 4 - param.output.tensorDim.cout() + param.axis
-    for i in 0..<4 {
-      if param.transpose[i] == axis {
-        axis = i
-        break
-      }
-    }
-    pm.axis = Int32(axis)
-    pm.odim = (Int32(param.output.dim[0]), Int32(param.output.dim[1]), Int32(param.output.dim[2]), Int32(param.output.dim[3]))
-    pm.trans = (Int32(param.output.transpose[0]), Int32(param.output.transpose[1]), Int32(param.output.transpose[2]), Int32(param.output.transpose[3]))
-    var vdim: [Int] = [0, 0, 0, 0, 0, 0]
-    for i in 0..<num {
-      vdim[i] = param.input[i].dim[axis]
-    }
-    if orank == 4 {
-      if axis == 1 {
-        v = "y"
-      } else if axis == 2 {
-        v = "x"
-      } else {
-        if (param.output.dim[0] == 1) && axis == 3 {
-          var vz = true
-          for i in 0..<num {
-            if vdim[i] % 4 != 0 {
-              vz = false
-              break
-            }
-          }
-          if vz {
-            v = "z"
-            for i in 0..<num {
-              vdim[i] = vdim[i] / 4
-            }
-          }
+    var v = "normal"
+    var pm = ConcatMetalParam.init()
+    func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam<P>) throws {
+        
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
         }
-      }
-    } else if orank == 3 {
-      if axis == 2 {
-        v = "y"
-      } else if axis == 3 {
-        v = "x"
-      } else if axis == 1 {
-        var vz = true
+        let num = param.input.count
         for i in 0..<num {
-          if vdim[i] % 4 != 0 {
-            vz = false
-            break
-          }
+            encoder.setTexture(param.input[i].metalTexture, index: i)
         }
-        if vz {
-          v = "z"
-          for i in 0..<num {
-            vdim[i] = vdim[i] / 4
-          }
+        encoder.setTexture(param.output.metalTexture, index: num)
+        if v == "normal" {
+            encoder.setTexture(param.output.metalTexture, index: num + 1)
         }
-      }
-    } else {
-      if axis == 2 {
-        v = "y"
-      } else if axis == 3 {
-        var vx = true
+        encoder.setBytes(&pm, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
+    }
+    
+    required init(device: MTLDevice, param: ConcatParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+        let orank = param.output.tensorDim.cout()
+        let num = param.input.count
+        assert(num <= 6)
+        var axis = 4 - param.output.tensorDim.cout() + param.axis
+        for i in 0..<4 {
+            if param.transpose[i] == axis {
+                axis = i
+                break
+            }
+        }
+        pm.axis = Int32(axis)
+        pm.odim = (Int32(param.output.dim[0]), Int32(param.output.dim[1]), Int32(param.output.dim[2]), Int32(param.output.dim[3]))
+        pm.trans = (Int32(param.output.transpose[0]), Int32(param.output.transpose[1]), Int32(param.output.transpose[2]), Int32(param.output.transpose[3]))
+        var vdim: [Int] = [0, 0, 0, 0, 0, 0]
         for i in 0..<num {
-          if vdim[i] % 4 != 0 {
-            vx = false
-            break
-          }
+            vdim[i] = param.input[i].dim[axis]
         }
-        if vx {
-          v = "x"
-          for i in 0..<num {
-            vdim[i] = vdim[i] / 4
-          }
+        if orank == 4 {
+            if axis == 1 {
+                v = "y"
+            } else if axis == 2 {
+                v = "x"
+            } else {
+                if (param.output.dim[0] == 1) && axis == 3 {
+                    var vz = true
+                    for i in 0..<num {
+                        if vdim[i] % 4 != 0 {
+                            vz = false
+                            break
+                        }
+                    }
+                    if vz {
+                        v = "z"
+                        for i in 0..<num {
+                            vdim[i] = vdim[i] / 4
+                        }
+                    }
+                }
+            }
+        } else if orank == 3 {
+            if axis == 2 {
+                v = "y"
+            } else if axis == 3 {
+                v = "x"
+            } else if axis == 1 {
+                var vz = true
+                for i in 0..<num {
+                    if vdim[i] % 4 != 0 {
+                        vz = false
+                        break
+                    }
+                }
+                if vz {
+                    v = "z"
+                    for i in 0..<num {
+                        vdim[i] = vdim[i] / 4
+                    }
+                }
+            }
+        } else {
+            if axis == 2 {
+                v = "y"
+            } else if axis == 3 {
+                var vx = true
+                for i in 0..<num {
+                    if vdim[i] % 4 != 0 {
+                        vx = false
+                        break
+                    }
+                }
+                if vx {
+                    v = "x"
+                    for i in 0..<num {
+                        vdim[i] = vdim[i] / 4
+                    }
+                }
+            }
+        }
+        pm.vdim = (Int32(vdim[0]), Int32(vdim[1]), Int32(vdim[2]), Int32(vdim[3]), Int32(vdim[4]), Int32(vdim[5]))
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_float", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_half", initContext: initContext)
+        } else {
+            fatalError()
         }
-      }
     }
-    pm.vdim = (Int32(vdim[0]), Int32(vdim[1]), Int32(vdim[2]), Int32(vdim[3]), Int32(vdim[4]), Int32(vdim[5]))
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_float", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_half", initContext: initContext)
-    } else {
-      fatalError()
+    
+    required init(device: MTLDevice, testParam: ConcatTestParam, initContext: InitContext) {
+        super.init(device: device, inFunctionName: "concat", initContext: initContext)
     }
-  }
-  
-  required init(device: MTLDevice, testParam: ConcatTestParam, initContext: InitContext) {
-    super.init(device: device, inFunctionName: "concat", initContext: initContext)
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
index 650f1b449785e5ed50cede4b3e59c9e8c22e9f4b..b2dd3064469ef465f785c8e7d5153abce9d687a3 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
@@ -15,136 +15,136 @@
 import Foundation
 
 class ConvAddAddPreluKernel<P: PrecisionType>: Kernel, Computable {
-  var metalParam: MetalConvParam!
-  required init(device: MTLDevice, param: ConvAddAddPreluParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    
-    if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half", initContext: initContext)
-        }
+    var metalParam: MetalConvParam!
+    required init(device: MTLDevice, param: ConvAddAddPreluParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
+        param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
         
-      } else if param.filter.channel == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half", initContext: initContext)
-        }
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half", initContext: initContext)
+        if GlobalConfig.shared.computePrecision == .Float16 {
+            if param.filter.width == 1 && param.filter.height == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half", initContext: initContext)
+                }
+                
+            } else if param.filter.channel == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half", initContext: initContext)
+                }
+            } else if param.filter.width == 3 && param.filter.height == 3 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half", initContext: initContext)
+                }
+                
+            } else if param.filter.width == 1 && param.filter.height == 5 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half", initContext: initContext)
+                }
+            } else if param.filter.width == 5 && param.filter.height == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half", initContext: initContext)
+                }
+            } else {
+                fatalError(" unsupport yet ")
+            }
+        } else if GlobalConfig.shared.computePrecision == .Float32 {
+            if param.filter.width == 1 && param.filter.height == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float", initContext: initContext)
+                }
+            } else if param.filter.channel == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float", initContext: initContext)
+                }
+            } else if param.filter.width == 3 && param.filter.height == 3 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float", initContext: initContext)
+                }
+                
+            } else if param.filter.width == 1 && param.filter.height == 5 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float", initContext: initContext)
+                }
+            } else if param.filter.width == 5 && param.filter.height == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float", initContext: initContext)
+                }
+            } else {
+                fatalError(" unsupport yet ")
+            }
         } else {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half", initContext: initContext)
+            fatalError()
         }
         
-      } else if param.filter.width == 1 && param.filter.height == 5 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half", initContext: initContext)
-        }
-      } else if param.filter.width == 5 && param.filter.height == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half", initContext: initContext)
-        }
-      } else {
-        fatalError(" unsupport yet ")
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float", initContext: initContext)
-        }
-      } else if param.filter.channel == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float", initContext: initContext)
-        }
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float", initContext: initContext)
-        }
+        let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
         
-      } else if param.filter.width == 1 && param.filter.height == 5 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float", initContext: initContext)
-        }
-      } else if param.filter.width == 5 && param.filter.height == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float", initContext: initContext)
-        }
-      } else {
-        fatalError(" unsupport yet ")
-      }
-    } else {
-      fatalError()
+        let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
+        
+        //    print(" function: \(functionName)")
+        //    print("offset x: \(offsetX)")
+        //    print("offset y: \(offsetY)")
+        
+        let offsetZ = 0.0
+        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+        //    print("metal param: ")
+        //    print(inMetalParam)
+        
+        metalParam = inMetalParam
     }
     
-    let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
-    
-    let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
-    
-    //    print(" function: \(functionName)")
-    //    print("offset x: \(offsetX)")
-    //    print("offset y: \(offsetY)")
-    
-    let offsetZ = 0.0
-    let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
-    //    print("metal param: ")
-    //    print(inMetalParam)
-    
-    metalParam = inMetalParam
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ConvAddAddPreluParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    /// Encodes one compute pass for this kernel into `commandBuffer`; throws if no encoder can be created.
+    func compute(commandBuffer: MTLCommandBuffer, param: ConvAddAddPreluParam<P>) throws {
+        guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        defer { computeEncoder.endEncoding() }
+        computeEncoder.setTexture(param.input.metalTexture, index: 0)
+        computeEncoder.setTexture(param.output.metalTexture, index: 1)
+        computeEncoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+        computeEncoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+        computeEncoder.setBuffer(param.y.buffer, offset: 0, index: 2)
+        computeEncoder.setBuffer(param.alpha.buffer, offset: 0, index: 3)
+        computeEncoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
     }
-    
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
-    encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
-    encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
index 6274e3df8f6e588ecee75ac243c3abe1b5f45828..0ff0b57f6c8497cf916804b152fbd57a301c6a5c 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
@@ -16,165 +16,165 @@ import Foundation
 import Metal
 
 struct ConvAddBatchNormReluTestParam: TestParam {
-  let inputTexture: MTLTexture
-  let outputTexture: MTLTexture
-  var metalParam: MetalConvParam
-  let filterBuffer: MTLBuffer
-  let biaseBuffer: MTLBuffer
-  let newScaleBuffer: MTLBuffer
-  let newBiaseBuffer: MTLBuffer
-  let filterSize: (width: Int, height: Int, channel: Int)
-  init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) {
-    inputTexture = inInputTexture
-    outputTexture = inOutputTexture
-    metalParam = inMetalParam
-    filterBuffer = inFilterBuffer
-    biaseBuffer = inBiaseBuffer
-    newScaleBuffer = inNewScaleBuffer
-    newBiaseBuffer = inNewBiaseBuffer
-    filterSize = inFilterSize
-  }
+    // Textures bound at texture indices 0 (input) and 1 (output) by the kernel's `test` method.
+    let inputTexture, outputTexture: MTLTexture
+    // Convolution parameter block; `test` uploads a copy of it as bytes at buffer index 0.
+    var metalParam: MetalConvParam
+    // Buffers bound by `test` at buffer indices 1 through 4, in this declaration order.
+    let filterBuffer, biaseBuffer, newScaleBuffer, newBiaseBuffer: MTLBuffer
+    // Filter dimensions; the kernel's test initializer reads them to choose the Metal function.
+    let filterSize: (width: Int, height: Int, channel: Int)
+    init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) {
+        self.inputTexture = inInputTexture
+        self.outputTexture = inOutputTexture
+        self.metalParam = inMetalParam
+        self.filterBuffer = inFilterBuffer
+        self.biaseBuffer = inBiaseBuffer
+        self.newScaleBuffer = inNewScaleBuffer
+        self.newBiaseBuffer = inNewBiaseBuffer
+        self.filterSize = inFilterSize
+    }
 }
 
 class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
-  required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam, initContext: InitContext) {
-    if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 {
-      super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1", initContext: initContext)
-    } else if testParam.filterSize.channel == 1 {
-      super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3", initContext: initContext)
-    } else {
-      super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3", initContext: initContext)
+    /// Test-only initializer: chooses the Metal function from the filter size carried by `testParam`.
+    required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam, initContext: InitContext) {
+        let size = testParam.filterSize
+        let isOneByOne = size.width == 1 && size.height == 1
+        // A 1x1 filter takes precedence over the single-channel (depthwise) check.
+        let functionName = isOneByOne ? "conv_add_batch_norm_relu_1x1"
+            : (size.channel == 1 ? "depthwise_conv_add_batch_norm_relu_3x3" : "conv_add_batch_norm_relu_3x3")
+        super.init(device: device, inFunctionName: functionName, initContext: initContext)
     }
-  }
-  
-  var metalParam: MetalConvParam!
-  
-  required init(device: MTLDevice, param: ConvAddBatchNormReluParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.variance.initBuffer(device: device, precision: .Float32)
-    param.mean.initBuffer(device: device, precision: .Float32)
-    param.scale.initBuffer(device: device, precision: .Float32)
-    param.bias.initBuffer(device: device, precision: .Float32)
     
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1", initContext: initContext)
-      } else if param.filter.channel == 1 {
-        super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3", initContext: initContext)
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3", initContext: initContext)
-      } else {
-        fatalError(" unsupport ")
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1_half", initContext: initContext)
-      } else if param.filter.channel == 1 {
-        super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3_half", initContext: initContext)
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3_half", initContext: initContext)
-      } else {
-        fatalError(" unsupport ")
-      }
-    } else {
-      fatalError()
+    var metalParam: MetalConvParam!
+    
+    /// Sets up GPU resources on `param`, selects the Metal function for the active precision and
+    /// filter shape, and precomputes `param.newScale` / `param.newBiase` from the batch-norm inputs.
+    /// Terminates via `fatalError` when the compute precision or the filter shape is not supported.
+    required init(device: MTLDevice, param: ConvAddBatchNormReluParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
+        param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        // The batch-norm inputs are initialized with Float32 precision regardless of the compute precision.
+        param.variance.initBuffer(device: device, precision: .Float32)
+        param.mean.initBuffer(device: device, precision: .Float32)
+        param.scale.initBuffer(device: device, precision: .Float32)
+        param.bias.initBuffer(device: device, precision: .Float32)
+
+        // Float32 function names are unsuffixed; the Float16 variants end in "_half".
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            if param.filter.width == 1 && param.filter.height == 1 {
+                super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1", initContext: initContext)
+            } else if param.filter.channel == 1 {
+                super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3", initContext: initContext)
+            } else if param.filter.width == 3 && param.filter.height == 3 {
+                super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3", initContext: initContext)
+            } else {
+                fatalError(" unsupport ")
+            }
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            if param.filter.width == 1 && param.filter.height == 1 {
+                super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1_half", initContext: initContext)
+            } else if param.filter.channel == 1 {
+                super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3_half", initContext: initContext)
+            } else if param.filter.width == 3 && param.filter.height == 3 {
+                super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3_half", initContext: initContext)
+            } else {
+                fatalError(" unsupport ")
+            }
+        } else {
+            fatalError()
+        }
+
+        // NOTE(review): these offsets do not factor in `param.dilations` - confirm the shaders expect that.
+        let offsetX = param.filter.width/2 - Int(param.paddings[0])
+        let offsetY = param.filter.height/2 - Int(param.paddings[1])
+        let offsetZ = 0.0
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+
+        // invs[i] = 1 / sqrt(variance[i] + epsilon)
+        var invs: [P] = []
+        let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
+        for i in 0..<param.variance.buffer.length/MemoryLayout<P>.stride {
+            let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
+            invs.append(P(inv))
+        }
+
+        // NOTE(review): `capacity` is an element count but receives the buffer length, so these scratch
+        // allocations look larger than needed; only the first length/stride slots are written below.
+        let newScale: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.scale.buffer.length)
+        let newBiase: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.bias.buffer.length)
+
+        let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
+        let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
+        let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
+        // newScale = scale * inv;  newBiase = bias - mean * inv * scale
+        for i in 0..<param.scale.buffer.length/MemoryLayout<P>.stride {
+            newScale[i] = invs[i] * scaleContents[i]
+            newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i]
+        }
+
+        let newBiaseBuffer: MTLBuffer
+        let newScaleBuffer: MTLBuffer
+
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)!
+            newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)!
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            // Half precision needs half the bytes of the Float32 source; each buffer is sized from
+            // its own source tensor (bias for the bias buffer, scale for the scale buffer).
+            newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
+            newScaleBuffer = device.makeBuffer(length: param.scale.buffer.length / 2)!
+
+            float32ToFloat16(input: newBiase as! UnsafeMutablePointer<Float32>, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout<P>.size)
+
+            float32ToFloat16(input: newScale as! UnsafeMutablePointer<Float32>, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout<P>.size)
+        } else {
+            fatalError(" unsupport ")
+        }
+
+        param.newBiase = newBiaseBuffer
+        param.newScale = newScaleBuffer
+
+        // The values have been copied or converted into the Metal buffers above; release the scratch memory.
+        newScale.deinitialize(count: param.scale.buffer.length)
+        newScale.deallocate()
+
+        newBiase.deinitialize(count: param.bias.buffer.length)
+        newBiase.deallocate()
     }
     
-    let offsetX = param.filter.width/2 - Int(param.paddings[0])
-    let offsetY = param.filter.height/2 - Int(param.paddings[1])
-    
-    print("offset x: \(offsetX)")
-    print("offset y: \(offsetY)")
-    
-    let offsetZ = 0.0
-    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
-    
-    var invs: [P] = []
-    let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
-    
-    for i in 0..<param.variance.buffer.length/MemoryLayout<P>.stride {
-      let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
-      invs.append(P(inv))
+    /// Encodes one compute pass for this kernel into `commandBuffer`; throws if no encoder can be created.
+    func compute(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluParam<P>) throws {
+        guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        defer { computeEncoder.endEncoding() }
+        computeEncoder.setTexture(param.input.metalTexture, index: 0)
+        computeEncoder.setTexture(param.output.metalTexture, index: 1)
+        computeEncoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+        computeEncoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+        computeEncoder.setBuffer(param.y.buffer, offset: 0, index: 2)
+        computeEncoder.setBuffer(param.newScale!, offset: 0, index: 3)
+        computeEncoder.setBuffer(param.newBiase!, offset: 0, index: 4)
+        computeEncoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
     }
     
-    let newScale: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.scale.buffer.length)
-    let newBiase: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.bias.buffer.length)
-    
-    let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
-    let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
-    let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
-    for i in 0..<param.scale.buffer.length/MemoryLayout<P>.stride {
-      newScale[i] = invs[i] * scaleContents[i]
-      newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i]
+    /// Encodes a pass from the pre-built resources in `param`; crashes if no encoder can be created.
+    public func test(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluTestParam) {
+        guard let testEncoder = commandBuffer.makeComputeCommandEncoder() else {
+            fatalError()
+        }
+        defer { testEncoder.endEncoding() }
+        var convParam = param.metalParam  // mutable copy: `setBytes` takes its argument in-out
+        testEncoder.setTexture(param.inputTexture, index: 0)
+        testEncoder.setTexture(param.outputTexture, index: 1)
+        testEncoder.setBytes(&convParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+        testEncoder.setBuffer(param.filterBuffer, offset: 0, index: 1)
+        testEncoder.setBuffer(param.biaseBuffer, offset: 0, index: 2)
+        testEncoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3)
+        testEncoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4)
+        testEncoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
     }
-    
-//    var newScaleFP16: UnsafeMutableRawPointer
-//
-//    float32ToFloat16(input: newScale as! UnsafeMutablePointer<Float32>, output: newScaleFP16, count: param.scale.buffer.length / MemoryLayout<P>.size)
-    
-    
-//    let newBiaseFloat16 = device.makeBuffer(length: <#T##Int#>, options: <#T##MTLResourceOptions#>)
-    
-    var newBiaseBuffer: MTLBuffer
-    var newScaleBuffer: MTLBuffer
-    
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)!
-      newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)!
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      
-      newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
-      newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
-      
-      float32ToFloat16(input: newBiase as! UnsafeMutablePointer<Float32>, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout<P>.size)
-      
-      float32ToFloat16(input: newScale as! UnsafeMutablePointer<Float32>, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout<P>.size)
-    } else {
-      fatalError(" unsupport ")
-    }
-    
-    param.newBiase = newBiaseBuffer
-    param.newScale = newScaleBuffer
-    
-    newScale.deinitialize(count: param.scale.buffer.length)
-    newScale.deallocate()
-    
-    newBiase.deinitialize(count: param.bias.buffer.length)
-    newBiase.deallocate()
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
-    }
-    
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
-    encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
-    encoder.setBuffer(param.newScale!, offset: 0, index: 3)
-    encoder.setBuffer(param.newBiase!, offset: 0, index: 4)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-  
-  public func test(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluTestParam) {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      fatalError()
-    }
-    
-    encoder.setTexture(param.inputTexture, index: 0)
-    encoder.setTexture(param.outputTexture, index: 1)
-    var inMetalParam = param.metalParam
-    encoder.setBytes(&inMetalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-    encoder.setBuffer(param.filterBuffer, offset: 0, index: 1)
-    encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2)
-    encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3)
-    encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4)
-    encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
index 0ba448161f4596b45797aec7ef186949de277c26..d40fa7e4456c22adc08f4a076bad50ac438a91f8 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
@@ -15,74 +15,74 @@
 import Foundation
 
 class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
-  var metalParam: MetalConvParam!
-  required init(device: MTLDevice, param: ConvAddParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-    let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1])
-    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC)
-    param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    
-    if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_add_1x1_half", initContext: initContext)
-      } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
-        super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half", initContext: initContext)
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        super.init(device: device, inFunctionName: "conv_add_3x3_half", initContext: initContext)
-      } else if param.filter.width == 1 && param.filter.height == 5 {
-        super.init(device: device, inFunctionName: "conv_add_5x1_half", initContext: initContext)
-      } else if param.filter.width == 5 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_add_1x5_half", initContext: initContext)
-      } else {
-        fatalError(" unsupport yet ")
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_add_1x1", initContext: initContext)
-      } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
-        super.init(device: device, inFunctionName: "depthwise_conv_add_3x3", initContext: initContext)
-      } else if param.filter.width == 1 && param.filter.height == 5 {
-        super.init(device: device, inFunctionName: "conv_add_5x1", initContext: initContext)
-      } else if param.filter.width == 5 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_add_1x5", initContext: initContext)
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        super.init(device: device, inFunctionName: "conv_add_3x3", initContext: initContext)
-      } else {
-        fatalError(" unsupport yet ")
-      }
-    } else {
-      fatalError()
+    var metalParam: MetalConvParam!
+    /// Builds the kernel for `param`: initializes the output texture and the filter / `y` buffers,
+    /// picks the Metal function matching the compute precision and filter shape, and stores the
+    /// convolution parameters in `metalParam`. Unsupported combinations end in `fatalError`.
+    required init(device: MTLDevice, param: ConvAddParam<P>, initContext: InitContext) {
+        let precision = GlobalConfig.shared.computePrecision
+        param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: precision)
+        // `padWhenOneC` is off only when filter.channel == 1 and filter.n == input.tensorDim[1].
+        let padWhenOneC = param.filter.channel != 1 || param.filter.n != param.input.tensorDim[1]
+        param.filter.initBuffer(device: device, precision: precision, padWhenOneC: padWhenOneC)
+        param.y.initBuffer(device: device, precision: precision)
+
+        // Float16 function names are the Float32 names plus a "_half" suffix.
+        let suffix: String
+        if precision == .Float16 {
+            suffix = "_half"
+        } else if precision == .Float32 {
+            suffix = ""
+        } else {
+            fatalError()
+        }
+
+        // The 1x1 check wins over the depthwise check, which wins over the remaining sizes.
+        let baseName: String
+        if param.filter.width == 1 && param.filter.height == 1 {
+            baseName = "conv_add_1x1"
+        } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
+            baseName = "depthwise_conv_add_3x3"
+        } else if param.filter.width == 3 && param.filter.height == 3 {
+            baseName = "conv_add_3x3"
+        } else if param.filter.width == 1 && param.filter.height == 5 {
+            baseName = "conv_add_5x1"
+        } else if param.filter.width == 5 && param.filter.height == 1 {
+            baseName = "conv_add_1x5"
+        } else {
+            fatalError(" unsupport yet ")
+        }
+        super.init(device: device, inFunctionName: baseName + suffix, initContext: initContext)
+
+        // Per-axis offset = (dilated filter extent) / 2 - padding.
+        let dilatedWidth = Int(param.dilations[0]) * (param.filter.width - 1) + 1
+        let dilatedHeight = Int(param.dilations[1]) * (param.filter.height - 1) + 1
+        let offsetX = dilatedWidth / 2 - Int(param.paddings[0])
+        let offsetY = dilatedHeight / 2 - Int(param.paddings[1])
+
+        // Narrow everything to the 16-bit fields of `MetalConvParam`; the Z offset is always zero.
+        metalParam = MetalConvParam(
+            offsetX: Int16(offsetX),
+            offsetY: Int16(offsetY),
+            offsetZ: Int16(0),
+            strideX: UInt16(param.stride[0]),
+            strideY: UInt16(param.stride[1]),
+            dilationX: UInt16(param.dilations[0]),
+            dilationY: UInt16(param.dilations[1])
+        )
     }
     
-
-    
-    let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
-    
-    let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
-    
-//    print(" function: \(functionName)")
-//    print("offset x: \(offsetX)")
-//    print("offset y: \(offsetY)")
-    
-    let offsetZ = 0.0
-    let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
-//    print("metal param: ")
-//    print(inMetalParam)
-    
-    metalParam = inMetalParam
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    /// Encodes one compute pass for this kernel into `commandBuffer`; throws if no encoder can be created.
+    func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam<P>) throws {
+        guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        defer { computeEncoder.endEncoding() }
+        computeEncoder.setTexture(param.input.metalTexture, index: 0)
+        computeEncoder.setTexture(param.output.metalTexture, index: 1)
+        computeEncoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+        computeEncoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+        computeEncoder.setBuffer(param.y.buffer, offset: 0, index: 2)
+        computeEncoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
     }
-    
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
-    encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
index 1d66696050f6db9c9c0ab041f9ad4b7ed2369648..1b054cb9cab80c0cb053e2cdbf8d0af6779ee477 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
@@ -15,136 +15,136 @@
 import Foundation
 
 class ConvAddPreluKernel<P: PrecisionType>: Kernel, Computable {
-  var metalParam: MetalConvParam!
-  required init(device: MTLDevice, param: ConvAddPreluParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-
-    if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half", initContext: initContext)
-        }
+    var metalParam: MetalConvParam!
+    /// Initializes the output texture and the filter, `y` and `alpha` buffers, then calls `super.init`
+    /// with a function name chosen from the compute precision, the filter shape and `param.mode`.
+    /// Unsupported precisions or filter shapes trap via `fatalError`; `metalParam` is filled in last.
+    required init(device: MTLDevice, param: ConvAddPreluParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
+        param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
         
-      } else if param.filter.channel == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half", initContext: initContext)
-        }
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half", initContext: initContext)
+        if GlobalConfig.shared.computePrecision == .Float16 {
+            if param.filter.width == 1 && param.filter.height == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half", initContext: initContext)
+                }
+            } else if param.filter.channel == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half", initContext: initContext)
+                }
+            } else if param.filter.width == 3 && param.filter.height == 3 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half", initContext: initContext)
+                }
+                // Next two branches: width 1 x height 5 selects the "5x1" names, width 5 x height 1 the "1x5" names.
+            } else if param.filter.width == 1 && param.filter.height == 5 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half", initContext: initContext)
+                }
+            } else if param.filter.width == 5 && param.filter.height == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half", initContext: initContext)
+                }
+            } else {
+                fatalError(" unsupport yet ")
+            }
+        } else if GlobalConfig.shared.computePrecision == .Float32 {
+            if param.filter.width == 1 && param.filter.height == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float", initContext: initContext)
+                }
+            } else if param.filter.channel == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float", initContext: initContext)
+                }
+            } else if param.filter.width == 3 && param.filter.height == 3 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float", initContext: initContext)
+                }
+            } else if param.filter.width == 1 && param.filter.height == 5 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float", initContext: initContext)
+                }
+            } else if param.filter.width == 5 && param.filter.height == 1 {
+                if param.mode == "channel" {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float", initContext: initContext)
+                } else if param.mode == "element" {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float", initContext: initContext)
+                } else {
+                    super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float", initContext: initContext)
+                }
+            } else {
+                fatalError(" unsupport yet ")
+            }
         } else {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half", initContext: initContext)
+            fatalError()
         }
         
-      } else if param.filter.width == 1 && param.filter.height == 5 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half", initContext: initContext)
-        }
-      } else if param.filter.width == 5 && param.filter.height == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half", initContext: initContext)
-        }
-      } else {
-        fatalError(" unsupport yet ")
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float", initContext: initContext)
-        }
-      } else if param.filter.channel == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float", initContext: initContext)
-        }
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float", initContext: initContext)
-        }
+        // Offset per axis: half of the dilated filter extent minus the padding on that axis.
+        let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
         
-      } else if param.filter.width == 1 && param.filter.height == 5 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float", initContext: initContext)
-        }
-      } else if param.filter.width == 5 && param.filter.height == 1 {
-        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float", initContext: initContext)
-        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float", initContext: initContext)
-        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float", initContext: initContext)
-        }
-      } else {
-        fatalError(" unsupport yet ")
-      }
-    } else {
-      fatalError()
+        let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
+        //    print(" function: \(functionName)")
+        //    print("offset x: \(offsetX)")
+        //    print("offset y: \(offsetY)")
+        // offsetZ is always zero; the Double literal is converted by Int16(_:) below.
+        let offsetZ = 0.0
+        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+        //    print("metal param: ")
+        //    print(inMetalParam)
+        metalParam = inMetalParam
     }
     
-    let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
-    
-    let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
-    
-    //    print(" function: \(functionName)")
-    //    print("offset x: \(offsetX)")
-    //    print("offset y: \(offsetY)")
-    
-    let offsetZ = 0.0
-    let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
-    //    print("metal param: ")
-    //    print(inMetalParam)
-    
-    metalParam = inMetalParam
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ConvAddPreluParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    /// Encodes this kernel's compute pass into `commandBuffer`.
+    /// - Throws: `PaddleMobileError.predictError` when no compute command encoder can be created.
+    func compute(commandBuffer: MTLCommandBuffer, param: ConvAddPreluParam<P>) throws {
+        guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else { throw PaddleMobileError.predictError(message: " encode is nil") }
+        // Textures: input (0), output (1). Buffers: conv parameters (0), filter (1), `y` (2), `alpha` (3).
+        computeEncoder.setTexture(param.input.metalTexture, index: 0)
+        computeEncoder.setTexture(param.output.metalTexture, index: 1)
+        computeEncoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+        computeEncoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+        computeEncoder.setBuffer(param.y.buffer, offset: 0, index: 2)
+        computeEncoder.setBuffer(param.alpha.buffer, offset: 0, index: 3)
+        computeEncoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        computeEncoder.endEncoding()
     }
-    
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
-    encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
-    encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
index 81c53a57a81155a9d4e804472764e3e0dab28fa6..415ec94b517405ad15f331bd3fcebf8414fbf754 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
@@ -16,165 +16,165 @@ import Foundation
 import MetalPerformanceShaders
 
 struct ConvBNReluTestParam: TestParam {
-  let inputTexture: MTLTexture
-  let outputTexture: MTLTexture
-  var metalParam: MetalConvParam
-  let filterBuffer: MTLBuffer
-  let biaseBuffer: MTLBuffer
-  let newScaleBuffer: MTLBuffer
-  let newBiaseBuffer: MTLBuffer
-  let filterSize: (width: Int, height: Int, channel: Int)
-  init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) {
-    
-    inputTexture = inInputTexture
-    outputTexture = inOutputTexture
-    metalParam = inMetalParam
-    filterBuffer = inFilterBuffer
-    biaseBuffer = inBiaseBuffer
-    newScaleBuffer = inNewScaleBuffer
-    newBiaseBuffer = inNewBiaseBuffer
-    filterSize = inFilterSize
-  }
+    let inputTexture: MTLTexture
+    let outputTexture: MTLTexture
+    var metalParam: MetalConvParam
+    let filterBuffer: MTLBuffer
+    let biaseBuffer: MTLBuffer
+    let newScaleBuffer: MTLBuffer
+    let newBiaseBuffer: MTLBuffer
+    let filterSize: (width: Int, height: Int, channel: Int)
+    init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) {
+        
+        inputTexture = inInputTexture
+        outputTexture = inOutputTexture
+        metalParam = inMetalParam
+        filterBuffer = inFilterBuffer
+        biaseBuffer = inBiaseBuffer
+        newScaleBuffer = inNewScaleBuffer
+        newBiaseBuffer = inNewBiaseBuffer
+        filterSize = inFilterSize
+    }
 }
 
 class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
-  required init(device: MTLDevice, testParam: ConvBNReluTestParam, initContext: InitContext) {
-    if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 {
-      super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1", initContext: initContext)
-    } else if testParam.filterSize.channel == 1 {
-      super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3", initContext: initContext)
-    } else {
-      super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3", initContext: initContext)
+    /// Initializer taking a `ConvBNReluTestParam`: picks the function name from `testParam.filterSize`.
+    required init(device: MTLDevice, testParam: ConvBNReluTestParam, initContext: InitContext) {
+        let size = testParam.filterSize
+        let isPointwise = size.width == 1 && size.height == 1
+        // A 1x1 filter wins first; otherwise a single-channel filter selects the depthwise variant.
+        let fallbackName = size.channel == 1 ? "depthwise_conv_batch_norm_relu_3x3" : "conv_batch_norm_relu_3x3"
+        let functionName = isPointwise ? "conv_batch_norm_relu_1x1" : fallbackName
+        super.init(device: device, inFunctionName: functionName, initContext: initContext)
     }
-  }
-  
-  var metalParam: MetalConvParam!
-
-  required init(device: MTLDevice, param: ConvBNReluParam<P>, initContext: InitContext) {
     
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.variance.initBuffer(device: device, precision: .Float32)
-    param.mean.initBuffer(device: device, precision: .Float32)
-    param.scale.initBuffer(device: device, precision: .Float32)
-    param.bias.initBuffer(device: device, precision: .Float32)
-    
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1", initContext: initContext)
-      } else if param.filter.channel == 1 {
-        super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3", initContext: initContext)
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3", initContext: initContext)
-      } else {
-        fatalError(" unsupport ")
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.filter.width == 1 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half", initContext: initContext)
-      } else if param.filter.channel == 1 {
-        super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half", initContext: initContext)
-      } else if param.filter.width == 3 && param.filter.height == 3 {
-        super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half", initContext: initContext)
-      } else {
-        fatalError(" unsupport ")
-      }
-    } else {
-      fatalError()
+    var metalParam: MetalConvParam!
+    
+    /// Sets up textures and buffers, picks the kernel function, and folds batch norm into a scale and a bias.
+    ///
+    /// - Parameters:
+    ///   - device: Metal device used to create the output texture and every buffer.
+    ///   - param: Operator parameters; `param.newScale` and `param.newBiase` are assigned here.
+    ///   - initContext: Forwarded unchanged to `super.init`.
+    required init(device: MTLDevice, param: ConvBNReluParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
+        param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        param.variance.initBuffer(device: device, precision: .Float32)
+        param.mean.initBuffer(device: device, precision: .Float32)
+        param.scale.initBuffer(device: device, precision: .Float32)
+        param.bias.initBuffer(device: device, precision: .Float32)
+        
+        // Choose the function name by compute precision and filter shape; anything else traps.
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            if param.filter.width == 1 && param.filter.height == 1 {
+                super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1", initContext: initContext)
+            } else if param.filter.channel == 1 {
+                super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3", initContext: initContext)
+            } else if param.filter.width == 3 && param.filter.height == 3 {
+                super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3", initContext: initContext)
+            } else {
+                fatalError(" unsupport ")
+            }
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            if param.filter.width == 1 && param.filter.height == 1 {
+                super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half", initContext: initContext)
+            } else if param.filter.channel == 1 {
+                super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half", initContext: initContext)
+            } else if param.filter.width == 3 && param.filter.height == 3 {
+                super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half", initContext: initContext)
+            } else {
+                fatalError(" unsupport ")
+            }
+        } else {
+            fatalError()
+        }
+        
+        // Offsets use the filter size and paddings only; dilations are passed through unchanged.
+        let offsetX = param.filter.width/2 - Int(param.paddings[0])
+        let offsetY = param.filter.height/2 - Int(param.paddings[1])
+        let offsetZ = 0.0
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+        
+        // invs[i] = 1 / (variance[i] + epsilon)^0.5
+        var invs: [P] = []
+        let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
+        for i in 0..<param.variance.buffer.length/MemoryLayout<P>.stride {
+            let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
+            invs.append(P(inv))
+        }
+        
+        // `allocate(capacity:)` counts elements, not bytes, so convert the byte lengths first.
+        let scaleCount = param.scale.buffer.length / MemoryLayout<P>.stride
+        let biasCount = param.bias.buffer.length / MemoryLayout<P>.stride
+        let newScale = UnsafeMutablePointer<P>.allocate(capacity: scaleCount)
+        // The fold loop writes `scaleCount` entries into both scratch arrays, hence the `max`.
+        let newBiase = UnsafeMutablePointer<P>.allocate(capacity: max(scaleCount, biasCount))
+        defer {
+            newScale.deinitialize(count: scaleCount)
+            newScale.deallocate()
+            newBiase.deinitialize(count: scaleCount)
+            newBiase.deallocate()
+        }
+        
+        let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
+        let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
+        let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
+        // newScale = inv * scale; newBiase = bias - mean * inv * scale
+        for i in 0..<scaleCount {
+            newScale[i] = invs[i] * scaleContents[i]
+            newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i]
+        }
+        
+        let newBiaseBuffer: MTLBuffer
+        let newScaleBuffer: MTLBuffer
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)!
+            newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)!
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            // Destination buffers get half the byte length of their source buffers.
+            newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
+            // Sized from the scale buffer's own length, not the bias buffer's.
+            newScaleBuffer = device.makeBuffer(length: param.scale.buffer.length / 2)!
+            
+            float32ToFloat16(input: newBiase as! UnsafeMutablePointer<Float32>, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout<P>.size)
+            float32ToFloat16(input: newScale as! UnsafeMutablePointer<Float32>, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout<P>.size)
+        } else {
+            fatalError(" unsupport ")
+        }
+        
+        // Publish the folded buffers; `compute` binds them at buffer indices 2 and 3.
+        param.newBiase = newBiaseBuffer
+        param.newScale = newScaleBuffer
     }
     
-   
-    
-    let offsetX = param.filter.width/2 - Int(param.paddings[0])
-    let offsetY = param.filter.height/2 - Int(param.paddings[1])
-    
-//    print(" param filter width: \(param.filter.width)")
-//    print(" param filter height: \(param.filter.height)")
-//
-//    print(" param paddings: \(param.paddings)")
-//
-//    print("ConvBNReluKernel offset x: \(offsetX)")
-//    print("ConvBNReluKernel offset y: \(offsetY)")
-    
-    let offsetZ = 0.0
-    
-    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
-    
-    var invs: [P] = []
-    let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
-    
-    for i in 0..<param.variance.buffer.length/MemoryLayout<P>.stride {
-      let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
-      invs.append(P(inv))
+    /// Encodes this kernel's compute pass into `commandBuffer`.
+    /// - Throws: `PaddleMobileError.predictError` when no compute command encoder can be created.
+    func compute(commandBuffer: MTLCommandBuffer, param: ConvBNReluParam<P>) throws {
+        guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else { throw PaddleMobileError.predictError(message: " encode is nil") }
+        // Textures: input (0), output (1). Buffers: conv parameters (0), filter (1), newScale (2), newBiase (3).
+        computeEncoder.setTexture(param.input.metalTexture, index: 0)
+        computeEncoder.setTexture(param.output.metalTexture, index: 1)
+        computeEncoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+        computeEncoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+        computeEncoder.setBuffer(param.newScale!, offset: 0, index: 2)
+        computeEncoder.setBuffer(param.newBiase!, offset: 0, index: 3)
+        computeEncoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        computeEncoder.endEncoding()
     }
     
-    let newScale: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.scale.buffer.length)
-    let newBiase: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.bias.buffer.length)
-    
-    let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
-    let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
-    let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
-    for i in 0..<param.scale.buffer.length/MemoryLayout<P>.stride {
-      newScale[i] = invs[i] * scaleContents[i]
-      newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i]
+    /// Encodes the same pass as `compute`, reading every texture and buffer from `param` instead.
+    /// Traps via `fatalError()` when no compute command encoder can be created.
+    public func test(commandBuffer: MTLCommandBuffer, param: ConvBNReluTestParam) {
+        guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else { fatalError() }
+        // `setBytes` takes an inout argument, so the conv parameters are copied into a local `var` first.
+        computeEncoder.setTexture(param.inputTexture, index: 0)
+        computeEncoder.setTexture(param.outputTexture, index: 1)
+        var convParam = param.metalParam
+        computeEncoder.setBytes(&convParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+        computeEncoder.setBuffer(param.filterBuffer, offset: 0, index: 1)
+        computeEncoder.setBuffer(param.newScaleBuffer, offset: 0, index: 2)
+        computeEncoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 3)
+        computeEncoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
+        computeEncoder.endEncoding()
     }
-    
-    var newBiaseBuffer: MTLBuffer
-    var newScaleBuffer: MTLBuffer
-    
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)!
-      newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)!
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      
-      newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
-      newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
-      
-      float32ToFloat16(input: newBiase as! UnsafeMutablePointer<Float32>, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout<P>.size)
-      
-      float32ToFloat16(input: newScale as! UnsafeMutablePointer<Float32>, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout<P>.size)
-    } else {
-      fatalError(" unsupport ")
-    }
-    
-    param.newBiase = newBiaseBuffer
-    param.newScale = newScaleBuffer
-    
-    newScale.deinitialize(count: param.scale.buffer.length)
-    newScale.deallocate()
-    
-    newBiase.deinitialize(count: param.bias.buffer.length)
-    newBiase.deallocate()
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ConvBNReluParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
-    }
-    
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
-    encoder.setBuffer(param.newScale!, offset: 0, index: 2)
-    encoder.setBuffer(param.newBiase!, offset: 0, index: 3)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-  
-  public func test(commandBuffer: MTLCommandBuffer, param: ConvBNReluTestParam) {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      fatalError()
-    }
-    
-    encoder.setTexture(param.inputTexture, index: 0)
-    encoder.setTexture(param.outputTexture, index: 1)
-    var inMetalParam = param.metalParam
-    encoder.setBytes(&inMetalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-    encoder.setBuffer(param.filterBuffer, offset: 0, index: 1)
-    encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 2)
-    encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 3)
-    encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
index 7571bc155b6a38afdcec7f646be6927f45f4b13a..7ff040219e9d5277f6becbbaf353386008863c97 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
@@ -15,48 +15,46 @@
 import Foundation
 
 public struct MetalConvParam {
-  let offsetX: Int16
-  let offsetY: Int16
-  let offsetZ: Int16
-  let strideX: UInt16
-  let strideY: UInt16
-  let dilationX: UInt16
-  let dilationY: UInt16
+    let offsetX: Int16 // ConvKernel sets this to filter.dim[2]/2 - paddings[0]
+    let offsetY: Int16 // ConvKernel sets this to filter.dim[1]/2 - paddings[1]
+    let offsetZ: Int16 // ConvKernel always passes 0
+    let strideX: UInt16 // ConvKernel fills this from param.stride[0]
+    let strideY: UInt16 // ConvKernel fills this from param.stride[1]
+    let dilationX: UInt16 // ConvKernel fills this from param.dilations[0]
+    let dilationY: UInt16 // ConvKernel fills this from param.dilations[1]
 }
 
 class ConvKernel<P: PrecisionType>: Kernel, Computable {
-  var metalParam: MetalConvParam!
-  required init(device: MTLDevice, param: ConvParam<P>, initContext: InitContext) {
-    param.filter.initBuffer(device: device, precision: ComputePrecision.Float32)
-    if param.filter.width == 1 && param.filter.height == 1 {
-      super.init(device: device, inFunctionName: "conv_1x1", initContext: initContext)
-    } else if param.filter.channel == 1 {
-      super.init(device: device, inFunctionName: "depthwise_conv_3x3", initContext: initContext)
-    } else if param.filter.width == 3 && param.filter.height == 3 {
-      super.init(device: device, inFunctionName: "conv_3x3", initContext: initContext)
-    } else {
-      fatalError(" unsupport ")
+    var metalParam: MetalConvParam!
+    required init(device: MTLDevice, param: ConvParam<P>, initContext: InitContext) { // Selects the shader function from the filter shape, then builds metalParam.
+        param.filter.initBuffer(device: device, precision: ComputePrecision.Float32) // the filter buffer is always requested as Float32 here
+        if param.filter.width == 1 && param.filter.height == 1 {
+            super.init(device: device, inFunctionName: "conv_1x1", initContext: initContext)
+        } else if param.filter.channel == 1 { // NOTE(review): width/height are not checked on this branch although the function is named 3x3 — confirm
+            super.init(device: device, inFunctionName: "depthwise_conv_3x3", initContext: initContext)
+        } else if param.filter.width == 3 && param.filter.height == 3 {
+            super.init(device: device, inFunctionName: "conv_3x3", initContext: initContext)
+        } else {
+            fatalError(" unsupport ") // any other filter shape aborts the process
+        }
+        // Offsets: half of filter.dim[2] / filter.dim[1] minus the matching paddings entry.
+        let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0])
+        let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1])
+        let offsetZ = 0.0 // Double literal; converted with Int16(offsetZ) below
+        // metalParam is assigned only after super.init has run in one of the branches above.
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
     }
-
-    let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0])
-    let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1])
-    let offsetZ = 0.0
     
-    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws { // Encodes one convolution dispatch into commandBuffer.
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil") // thrown when no compute encoder could be created
+        }
+        // Bindings: textures 0/1 = input/output, buffer 0 = metalParam, buffer 1 = filter.
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+        encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) // dispatch is driven by the output texture
+        encoder.endEncoding()
     }
-    
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
 }
-
-
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvTransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvTransposeKernel.swift
index c8b1361649f40237e1527744ea9ba2ad8b1648c1..f1753d0a09294e82eae5c01c23d4fe1c7b220c8d 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvTransposeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvTransposeKernel.swift
@@ -15,69 +15,69 @@
 import Foundation
 
 struct MetalConvTransposeParam {
-  let kernelW: UInt16;
-  let kernelH: UInt16;
-  
-  let strideX: UInt16;
-  let strideY: UInt16;
-  
-  let paddingX: UInt16;
-  let paddingY: UInt16;
-  
-  let dilationX: UInt16;
-  let dilationY: UInt16;
-}
-
-class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{
-  var metalParam: MetalConvTransposeParam!
-  required init(device: MTLDevice, param: ConvTransposeParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, convertToNHWC: false, withTranspose: true)
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.stride == [2, 2] && param.stride == [2, 2] {
-        super.init(device: device, inFunctionName: "conv_transpose2x2_stride2", initContext: initContext)
-      } else {
-        fatalError(" -- conv transpose unsupported yet -- ")
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.stride == [2, 2] && param.stride == [2, 2] {
-        super.init(device: device, inFunctionName: "conv_transpose2x2_stride2_half", initContext: initContext)
-      } else {
-        fatalError(" -- conv transpose unsupported yet -- ")
-      }
-    } else {
-      fatalError()
-    }
+    let kernelW: UInt16; // ConvTransposeKernel fills this from param.filter.width
+    let kernelH: UInt16; // ConvTransposeKernel fills this from param.filter.height
     
-//    let filter: [Float32] = param.filter.buffer.array()
-//    print(" conv transpose filter")
-//    print(filter)
-    let kernelWidth = UInt16(param.filter.width)
-    let kernelHeight = UInt16(param.filter.height)
+    let strideX: UInt16; // from param.stride[0]
+    let strideY: UInt16; // from param.stride[1]
     
-    let strideX = UInt16(param.stride[0])
-    let strideY = UInt16(param.stride[1])
-    let paddingX = UInt16(param.paddings[0])
-    let paddingY = UInt16(param.paddings[1])
-    let dilationX = UInt16(param.dilations[0])
-    let dilationY = UInt16(param.dilations[1])
+    let paddingX: UInt16; // from param.paddings[0]
+    let paddingY: UInt16; // from param.paddings[1]
     
-    metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY)
+    let dilationX: UInt16; // from param.dilations[0]
+    let dilationY: UInt16; // from param.dilations[1]
+}
 
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encoder is nil")
+class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{
+    var metalParam: MetalConvTransposeParam!
+    required init(device: MTLDevice, param: ConvTransposeParam<P>, initContext: InitContext) { // Prepares output/filter storage, selects the shader by precision, and builds metalParam.
+        param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+        param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, convertToNHWC: false, withTranspose: true)
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            if param.stride == [2, 2] { // NOTE(review): this test was duplicated verbatim; the function name suggests a 2x2 filter check may have been intended — confirm
+                super.init(device: device, inFunctionName: "conv_transpose2x2_stride2", initContext: initContext)
+            } else {
+                fatalError(" -- conv transpose unsupported yet -- ")
+            }
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            if param.stride == [2, 2] { // NOTE(review): same duplicated test removed here — confirm whether a filter-size check is wanted
+                super.init(device: device, inFunctionName: "conv_transpose2x2_stride2_half", initContext: initContext)
+            } else {
+                fatalError(" -- conv transpose unsupported yet -- ")
+            }
+        } else {
+            fatalError() // any other compute precision aborts
+        }
+        // Disabled debug dump of the filter buffer:
+        //    let filter: [Float32] = param.filter.buffer.array()
+        //    print(" conv transpose filter")
+        //    print(filter)
+        let kernelWidth = UInt16(param.filter.width)
+        let kernelHeight = UInt16(param.filter.height)
+        
+        let strideX = UInt16(param.stride[0])
+        let strideY = UInt16(param.stride[1])
+        let paddingX = UInt16(param.paddings[0])
+        let paddingY = UInt16(param.paddings[1])
+        let dilationX = UInt16(param.dilations[0])
+        let dilationY = UInt16(param.dilations[1])
+        // metalParam is assigned only after super.init has run in one of the branches above.
+        metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY)
+        
     }
     
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvTransposeParam>.size, index: 0)
-    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
+    func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam<P>) throws { // Encodes one transposed-convolution dispatch into commandBuffer.
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encoder is nil") // thrown when no compute encoder could be created
+        }
+        // Bindings: textures 0/1 = input/output, buffer 0 = metalParam, buffer 1 = filter.
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvTransposeParam>.size, index: 0)
+        encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) // dispatch is driven by the output texture
+        encoder.endEncoding()
+    }
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift
index 21108de10e6de4848649a0cbd237ff36243e7be9..2a87d4362f51a237f5d8620116cdc6c17c7d059c 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift
@@ -15,59 +15,59 @@
 import Foundation
 
 struct ElementwiseAddMetalParam {
-  var fast: Int32 = 0
-  var axis: Int32 = 0
-  var ylen: Int32 = 0
-  var xdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
-  var xtrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
-  var ydim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
-  var ytrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
+    var fast: Int32 = 0 // the kernels set this to 1 when X and Y have identical dim and transpose
+    var axis: Int32 = 0 // 4 - inputY.tensorDim.cout() when param.axis == -1, else 4 - inputX.tensorDim.cout() + param.axis
+    var ylen: Int32 = 0 // inputY.tensorDim.cout()
+    var xdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) // inputX.dim entries 0...3
+    var xtrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) // inputX.transpose entries 0...3
+    var ydim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) // inputY.dim entries 0...3
+    var ytrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) // inputY.transpose entries 0...3
 }
 
 class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable {
-  var metalParam: ElementwiseAddMetalParam
-  required init(device: MTLDevice, param: ElementwiseAddParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    
-    metalParam = ElementwiseAddMetalParam.init()
-    
-    let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) }
-    let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) }
-    let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) }
-    let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) }
-    
-    metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
-    metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
-    metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
-    metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
-    if param.axis == -1 {
-      metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout())
-    } else {
-      metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
+    var metalParam: ElementwiseAddMetalParam
+    required init(device: MTLDevice, param: ElementwiseAddParam<P>, initContext: InitContext) { // Builds metalParam from both inputs, then selects the shader by precision.
+        param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: GlobalConfig.shared.computePrecision) // output reuses inputX's transpose
+        // metalParam is a non-optional stored property, so it is fully set up before super.init below.
+        metalParam = ElementwiseAddMetalParam.init()
+        // Copy the first four dim/transpose entries of each input as Int32.
+        let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) }
+        let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) }
+        let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) }
+        let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) }
+        
+        metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
+        metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
+        metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
+        metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
+        if param.axis == -1 { // -1 is handled separately: the axis is derived from Y alone
+            metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout()) // cout() is presumably the number of dimensions — confirm
+        } else {
+            metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
+        }
+        metalParam.ylen = Int32(param.inputY.tensorDim.cout())
+        if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) { // identical layouts enable the fast flag
+            //      print("===> elementwise_add fast!!!")
+            metalParam.fast = 1
+        }
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "elementwise_add", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "elementwise_add_half", initContext: initContext)
+        } else {
+            fatalError() // any other compute precision aborts
+        }
     }
-    metalParam.ylen = Int32(param.inputY.tensorDim.cout())
-    if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
-      //      print("===> elementwise_add fast!!!")
-      metalParam.fast = 1
-    }
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "elementwise_add", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "elementwise_add_half", initContext: initContext)
-    } else {
-      fatalError()
-    }
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    
+    func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam<P>) throws { // Encodes one elementwise-add dispatch into commandBuffer.
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil") // thrown when no compute encoder could be created
+        }
+        encoder.setTexture(param.inputX.metalTexture, index: 0) // texture 0 = X
+        encoder.setTexture(param.inputY.metalTexture, index: 1) // texture 1 = Y
+        encoder.setTexture(param.output.metalTexture, index: 2) // texture 2 = output
+        encoder.setBytes(&metalParam, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0) // buffer 0 = metalParam
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) // dispatch is driven by the output texture
+        encoder.endEncoding()
     }
-    encoder.setTexture(param.inputX.metalTexture, index: 0)
-    encoder.setTexture(param.inputY.metalTexture, index: 1)
-    encoder.setTexture(param.output.metalTexture, index: 2)
-    encoder.setBytes(&metalParam, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddPreluKernel.swift
index a423a119f375641eeadd585360d62787d55a82d4..cf83c2e750aaf8c3c9dfd27c096970b84be217d7 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddPreluKernel.swift
@@ -16,64 +16,64 @@ import Foundation
 
 
 class ElementwiseAddPreluKernel<P: PrecisionType>: Kernel, Computable {
-  var metalParam: ElementwiseAddMetalParam
-  required init(device: MTLDevice, param: ElementwiseAddPreluParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-   
-    metalParam = ElementwiseAddMetalParam.init()
-    
-    let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) }
-    let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) }
-    let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) }
-    let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) }
-    
-    metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
-    metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
-    metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
-    metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
-    if param.axis == -1 {
-      metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout())
-    } else {
-      metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
-    }
-    metalParam.ylen = Int32(param.inputY.tensorDim.cout())
-    if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
-      //      print("===> elementwise_add fast!!!")
-      metalParam.fast = 1
+    var metalParam: ElementwiseAddMetalParam
+    required init(device: MTLDevice, param: ElementwiseAddPreluParam<P>, initContext: InitContext) { // Builds metalParam from both inputs, then selects the shader by precision and param.mode.
+        param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: GlobalConfig.shared.computePrecision) // output reuses inputX's transpose
+        param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        // metalParam is a non-optional stored property, so it is fully set up before super.init below.
+        metalParam = ElementwiseAddMetalParam.init()
+        // Copy the first four dim/transpose entries of each input as Int32.
+        let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) }
+        let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) }
+        let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) }
+        let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) }
+        
+        metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
+        metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
+        metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
+        metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
+        if param.axis == -1 { // -1 is handled separately: the axis is derived from Y alone
+            metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout()) // cout() is presumably the number of dimensions — confirm
+        } else {
+            metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
+        }
+        metalParam.ylen = Int32(param.inputY.tensorDim.cout())
+        if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) { // identical layouts enable the fast flag
+            //      print("===> elementwise_add fast!!!")
+            metalParam.fast = 1
+        }
+        // Shader selection: Float32 has one function per mode; Float16 currently loads the same function for every mode.
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            if param.mode == "channel" {
+                super.init(device: device, inFunctionName: "elementwise_add_channel_float", initContext: initContext)
+            } else if param.mode == "element" {
+                super.init(device: device, inFunctionName: "elementwise_add_element_float", initContext: initContext)
+            } else {
+                super.init(device: device, inFunctionName: "elementwise_add_prelu_float", initContext: initContext)
+            }
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            if param.mode == "channel" {
+                super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext)
+            } else if param.mode == "element" {
+                super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext) // NOTE(review): "element" mode loads the channel function; Float32 uses a distinct *_element_* one — confirm
+            } else {
+                super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext) // NOTE(review): fallback mode also loads the channel function; Float32 uses *_prelu_* — confirm
+            }
+        } else {
+            fatalError() // any other compute precision aborts
+        }
     }
     
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.mode == "channel" {
-        super.init(device: device, inFunctionName: "elementwise_add_channel_float", initContext: initContext)
-      } else if param.mode == "element" {
-        super.init(device: device, inFunctionName: "elementwise_add_element_float", initContext: initContext)
-      } else {
-        super.init(device: device, inFunctionName: "elementwise_add_prelu_float", initContext: initContext)
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.mode == "channel" {
-        super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext)
-      } else if param.mode == "element" {
-        super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext)
-      } else {
-        super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext)
-      }
-    } else {
-      fatalError()
-    }
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddPreluParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddPreluParam<P>) throws { // Encodes one fused add + prelu dispatch into commandBuffer.
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil") // thrown when no compute encoder could be created
+        }
+        encoder.setTexture(param.inputX.metalTexture, index: 0) // texture 0 = X
+        encoder.setTexture(param.inputY.metalTexture, index: 1) // texture 1 = Y
+        encoder.setTexture(param.output.metalTexture, index: 2) // texture 2 = output
+        encoder.setBytes(&metalParam, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0) // buffer 0 = metalParam
+        encoder.setBuffer(param.alpha.buffer, offset: 0, index: 1) // buffer 1 = alpha
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) // dispatch is driven by the output texture
+        encoder.endEncoding()
     }
-    encoder.setTexture(param.inputX.metalTexture, index: 0)
-    encoder.setTexture(param.inputY.metalTexture, index: 1)
-    encoder.setTexture(param.output.metalTexture, index: 2)
-    encoder.setBytes(&metalParam, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0)
-    encoder.setBuffer(param.alpha.buffer, offset: 0, index: 1)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FetchKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FetchKernel.swift
index 7d6e68e699b6a7556915f9ce4136bedae29a6dcc..616fcc1f2d61c4ffba9baa2c7183a5b172fb7b2e 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FetchKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FetchKernel.swift
@@ -15,47 +15,47 @@
 import Foundation
 
 class FetchKernel<P: PrecisionType>: Kernel, Computable {
-  
-  required init(device: MTLDevice, param: FetchParam<P>, initContext: InitContext) {
-    param.output.initBuffer(device: device)
-    if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.input.transpose == [0, 2, 3, 1] {
-        super.init(device: device, inFunctionName: "fetch_half", initContext: initContext)
-      } else if param.input.transpose == [0, 1, 2, 3] {
-        switch param.input.tensorDim.cout() {
-        case 1, 2:
-          super.init(device: device, inFunctionName: "fetch_1or2_half", initContext: initContext)
-        default:
-          fatalError(" not support ")
+    
+    required init(device: MTLDevice, param: FetchParam<P>, initContext: InitContext) { // Allocates the output buffer and selects the fetch shader by precision and input layout.
+        param.output.initBuffer(device: device)
+        if GlobalConfig.shared.computePrecision == .Float16 {
+            if param.input.transpose == [0, 2, 3, 1] {
+                super.init(device: device, inFunctionName: "fetch_half", initContext: initContext)
+            } else if param.input.transpose == [0, 1, 2, 3] {
+                switch param.input.tensorDim.cout() { // cout() is presumably the number of dimensions — confirm
+                case 1, 2:
+                    super.init(device: device, inFunctionName: "fetch_1or2_half", initContext: initContext)
+                default:
+                    fatalError(" not support ") // untransposed inputs are only handled for cout() of 1 or 2
+                }
+            } else {
+                fatalError(" not support ") // any other transpose aborts
+            }
+        } else if GlobalConfig.shared.computePrecision == .Float32 { // same decision tree as above with the *_float functions
+            if param.input.transpose == [0, 2, 3, 1] {
+                super.init(device: device, inFunctionName: "fetch_float", initContext: initContext)
+            } else if param.input.transpose == [0, 1, 2, 3] {
+                switch param.input.tensorDim.cout() {
+                case 1, 2:
+                    super.init(device: device, inFunctionName: "fetch_1or2_float", initContext: initContext)
+                default:
+                    fatalError(" not support ")
+                }
+            } else {
+                fatalError(" not support ")
+            }
+        } else {
+            fatalError(" not support ") // any other compute precision aborts
         }
-      } else {
-        fatalError(" not support ")
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.input.transpose == [0, 2, 3, 1] {
-        super.init(device: device, inFunctionName: "fetch_float", initContext: initContext)
-      } else if param.input.transpose == [0, 1, 2, 3] {
-        switch param.input.tensorDim.cout() {
-        case 1, 2:
-          super.init(device: device, inFunctionName: "fetch_1or2_float", initContext: initContext)
-        default:
-          fatalError(" not support ")
-        }
-      } else {
-        fatalError(" not support ")
-      }
-    } else {
-      fatalError(" not support ")
     }
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: FetchParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    
+    func compute(commandBuffer: MTLCommandBuffer, param: FetchParam<P>) throws { // Encodes the texture-to-buffer fetch dispatch into commandBuffer.
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil") // thrown when no compute encoder could be created
+        }
+        encoder.setTexture(param.input.metalTexture, index: 0) // texture 0 = input
+        encoder.setBuffer(param.output.resultBuffer!, offset: 0, index: 0) // force-unwrap: crashes if resultBuffer is nil; presumably set by initBuffer in init — confirm
+        encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture) // the output is a buffer, so the input texture is passed as outTexture
+        encoder.endEncoding()
     }
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setBuffer(param.output.resultBuffer!, offset: 0, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FlattenKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FlattenKernel.swift
index 06a6537e1f9612aa646668fab678879b1d782df0..5956806001e8ec3cfb33c6d13095f0e11aa9420d 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FlattenKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FlattenKernel.swift
@@ -15,57 +15,57 @@
 import Foundation
 
 struct FlattenMetalParam {
-  var idim: (Int32, Int32, Int32, Int32)
-  var itrans: (Int32, Int32, Int32, Int32)
-  var odim: (Int32, Int32, Int32, Int32)
-  var otrans: (Int32, Int32, Int32, Int32)
+    var idim: (Int32, Int32, Int32, Int32) // input tensorDim, right-aligned into four slots with leading 1s (see FlattenKernel.init)
+    var itrans: (Int32, Int32, Int32, Int32) // first four entries of input.transpose
+    var odim: (Int32, Int32, Int32, Int32) // output tensorDim, right-aligned the same way
+    var otrans: (Int32, Int32, Int32, Int32) // first four entries of output.transpose
 }
 
 
 class FlattenKernel<P: PrecisionType>: Kernel, Computable{
-  
-  var metalParam: FlattenMetalParam
-  
-  required init(device: MTLDevice, param: FlattenParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
-    var id: [Int32] = [1, 1, 1, 1]
-    for i in 0..<param.input.tensorDim.cout() {
-      id[4-param.input.tensorDim.cout()+i] = Int32(param.input.tensorDim[i])
-    }
-    let it: [Int32] = param.input.transpose.map { Int32($0) }
-    var od: [Int32] = [1, 1, 1, 1]
-    for i in 0..<param.output.tensorDim.cout() {
-      od[4-param.output.tensorDim.cout()+i] = Int32(param.output.tensorDim[i])
-    }
-    let ot: [Int32] = param.output.transpose.map { Int32($0) }
-    metalParam = FlattenMetalParam.init(
-      idim: (id[0], id[1], id[2], id[3]),
-      itrans: (it[0], it[1], it[2], it[3]),
-      odim: (od[0], od[1], od[2], od[3]),
-      otrans: (ot[0], ot[1], ot[2], ot[3])
-    )
-    let irank = param.input.tensorDim.cout()
-    let orank = param.output.tensorDim.cout()
-    assert(orank == 2)
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "reshape_\(irank)_2_float", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "reshape_\(irank)_2_half", initContext: initContext)
-    } else {
-      fatalError()
-    }
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: FlattenParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encoder is nil")
+    
+    var metalParam: FlattenMetalParam
+    
+    required init(device: MTLDevice, param: FlattenParam<P>, initContext: InitContext) { // Builds metalParam from input/output dims, then selects a reshape shader by input rank and precision.
+        param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
+        var id: [Int32] = [1, 1, 1, 1] // input dims, defaulting every slot to 1
+        for i in 0..<param.input.tensorDim.cout() {
+            id[4-param.input.tensorDim.cout()+i] = Int32(param.input.tensorDim[i]) // right-align: the last dim lands in slot 3
+        }
+        let it: [Int32] = param.input.transpose.map { Int32($0) }
+        var od: [Int32] = [1, 1, 1, 1] // output dims, same scheme as id
+        for i in 0..<param.output.tensorDim.cout() {
+            od[4-param.output.tensorDim.cout()+i] = Int32(param.output.tensorDim[i])
+        }
+        let ot: [Int32] = param.output.transpose.map { Int32($0) }
+        metalParam = FlattenMetalParam.init( // assigned before super.init because metalParam is a non-optional stored property
+            idim: (id[0], id[1], id[2], id[3]),
+            itrans: (it[0], it[1], it[2], it[3]),
+            odim: (od[0], od[1], od[2], od[3]),
+            otrans: (ot[0], ot[1], ot[2], ot[3])
+        )
+        let irank = param.input.tensorDim.cout()
+        let orank = param.output.tensorDim.cout()
+        assert(orank == 2) // debug-only check; the function names below hard-code an output rank of 2
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "reshape_\(irank)_2_float", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "reshape_\(irank)_2_half", initContext: initContext)
+        } else {
+            fatalError() // any other compute precision aborts
+        }
     }
     
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-
-    encoder.setBytes(&metalParam, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
+    func compute(commandBuffer: MTLCommandBuffer, param: FlattenParam<P>) throws { // Encodes one flatten (reshape to rank 2) dispatch into commandBuffer.
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encoder is nil") // thrown when no compute encoder could be created
+        }
+        // Bindings: textures 0/1 = input/output, buffer 0 = metalParam.
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        // The length must describe the value actually passed (a FlattenMetalParam), not a different struct type.
+        encoder.setBytes(&metalParam, length: MemoryLayout<FlattenMetalParam>.size, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) // dispatch is driven by the output texture
+        encoder.endEncoding()
+    }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/MulticlassNMSKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/MulticlassNMSKernel.swift
index d3fc5a3ac9e62c05d892e26aeca9560943a9e240..4f59bf9971704b26ba360cefac3fc50558db3d0e 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/MulticlassNMSKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/MulticlassNMSKernel.swift
@@ -15,41 +15,41 @@
 import Foundation
 
 class MulticlassNMSKernel<P: PrecisionType>: Kernel, Computable{
-  let pipline1: MTLComputePipelineState
-
-  required init(device: MTLDevice, param: MulticlassNMSParam<P>, initContext: InitContext) {
+    let pipline1: MTLComputePipelineState
     
-    param.middleOutput.initBuffer(device: device)
-    param.bboxOutput.initBuffer(device: device)
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      pipline1 = device.pipeLine(funcName: "nms_fetch_bbox", metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath)
-      super.init(device: device, inFunctionName: "nms_fetch_result", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      pipline1 = device.pipeLine(funcName: "nms_fetch_bbox_half", metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath)
-      super.init(device: device, inFunctionName: "nms_fetch_result_half", initContext: initContext)
-    } else {
-      fatalError( " unsupport precision " )
+    required init(device: MTLDevice, param: MulticlassNMSParam<P>, initContext: InitContext) {
+        
+        param.middleOutput.initBuffer(device: device)
+        param.bboxOutput.initBuffer(device: device)
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            pipline1 = device.pipeLine(funcName: "nms_fetch_bbox", metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath)
+            super.init(device: device, inFunctionName: "nms_fetch_result", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            pipline1 = device.pipeLine(funcName: "nms_fetch_bbox_half", metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath)
+            super.init(device: device, inFunctionName: "nms_fetch_result_half", initContext: initContext)
+        } else {
+            fatalError( " unsupport precision " )
+        }
+        
     }
     
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: MulticlassNMSParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    func compute(commandBuffer: MTLCommandBuffer, param: MulticlassNMSParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        
+        encoder.setTexture(param.scores.metalTexture, index: 0)
+        encoder.setBuffer(param.middleOutput.resultBuffer!, offset: 0, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.scores.metalTexture)
+        encoder.endEncoding()
+        
+        guard let encoderBox = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        
+        encoderBox.setTexture(param.bboxes.metalTexture, index: 0)
+        encoderBox.setBuffer(param.bboxOutput.resultBuffer!, offset: 0, index: 0)
+        encoderBox.dispatch(computePipline: pipline1, outTexture: param.bboxes.metalTexture)
+        encoderBox.endEncoding()
     }
-    
-    encoder.setTexture(param.scores.metalTexture, index: 0)
-    encoder.setBuffer(param.middleOutput.resultBuffer!, offset: 0, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.scores.metalTexture)
-    encoder.endEncoding()
-    
-    guard let encoderBox = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
-    }
-    
-    encoderBox.setTexture(param.bboxes.metalTexture, index: 0)
-    encoderBox.setBuffer(param.bboxOutput.resultBuffer!, offset: 0, index: 0)
-    encoderBox.dispatch(computePipline: pipline1, outTexture: param.bboxes.metalTexture)
-    encoderBox.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PoolKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PoolKernel.swift
index b6833a4f93e82efbf4ffc28998624ce0b6432b52..37878f26d08ed79e8bfa7c8a1e2e800c8775ff53 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PoolKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PoolKernel.swift
@@ -15,57 +15,57 @@
 import Foundation
 
 struct PoolMetalParam {
-  let ksizeX: Int32
-  let ksizeY: Int32
-  let strideX: Int32
-  let strideY: Int32
-  let paddingX: Int32
-  let paddingY: Int32
-  let poolType: Int32
+    let ksizeX: Int32
+    let ksizeY: Int32
+    let strideX: Int32
+    let strideY: Int32
+    let paddingX: Int32
+    let paddingY: Int32
+    let poolType: Int32
 }
 
 class PoolKernel<P: PrecisionType>: Kernel, Computable{
-  var metalParam: PoolMetalParam
-  required init(device: MTLDevice, param: PoolParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    
-    var poolType: Int32
-    switch param.poolType {
-    case "max":
-      poolType = 0
-    case "avg":
-      poolType = 1
-    default:
-      fatalError()
+    var metalParam: PoolMetalParam
+    required init(device: MTLDevice, param: PoolParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+        
+        var poolType: Int32
+        switch param.poolType {
+        case "max":
+            poolType = 0
+        case "avg":
+            poolType = 1
+        default:
+            fatalError()
+        }
+        metalParam = PoolMetalParam.init(
+            ksizeX: param.ksize[0],
+            ksizeY: param.ksize[1],
+            strideX: param.stride[0],
+            strideY: param.stride[1],
+            paddingX: param.padding[0],
+            paddingY: param.padding[1],
+            poolType: poolType
+        )
+        
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "pool_float", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "pool_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
     }
-    metalParam = PoolMetalParam.init(
-      ksizeX: param.ksize[0],
-      ksizeY: param.ksize[1],
-      strideX: param.stride[0],
-      strideY: param.stride[1],
-      paddingX: param.padding[0],
-      paddingY: param.padding[1],
-      poolType: poolType
-    )
     
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "pool_float", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "pool_half", initContext: initContext)
-    } else {
-      fatalError()
-    }
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: PoolParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encoder is nil")
+    func compute(commandBuffer: MTLCommandBuffer, param: PoolParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encoder is nil")
+        }
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        
+        encoder.setBytes(&metalParam, length: MemoryLayout<PoolMetalParam>.size, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-
-    encoder.setBytes(&metalParam, length: MemoryLayout<PoolMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PreluKernel.swift
index 61a21331a6bf1766a86a5849d7ea9001672642fa..053cb827e3c9d37253b487bddea43ee520aca308 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PreluKernel.swift
@@ -15,39 +15,39 @@
 import Foundation
 
 class PreluKernel<P: PrecisionType>: Kernel, Computable{
-  required init(device: MTLDevice, param: PreluParam<P>, initContext: InitContext) {
-    param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.mode == "channel" {
-        super.init(device: device, inFunctionName: "prelu_channel", initContext: initContext)
-      } else if param.mode == "element" {
-        super.init(device: device, inFunctionName: "prelu_element", initContext: initContext)
-      } else {
-        super.init(device: device, inFunctionName: "prelu_other", initContext: initContext)
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.mode == "channel" {
-        super.init(device: device, inFunctionName: "prelu_channel_half", initContext: initContext)
-      } else if param.mode == "element" {
-        super.init(device: device, inFunctionName: "prelu_element_half", initContext: initContext)
-      } else {
-        super.init(device: device, inFunctionName: "prelu_other_half", initContext: initContext)
-      }
-    } else {
-      fatalError()
-    }
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: PreluParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encoder is nil")
+    required init(device: MTLDevice, param: PreluParam<P>, initContext: InitContext) {
+        param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
+        param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            if param.mode == "channel" {
+                super.init(device: device, inFunctionName: "prelu_channel", initContext: initContext)
+            } else if param.mode == "element" {
+                super.init(device: device, inFunctionName: "prelu_element", initContext: initContext)
+            } else {
+                super.init(device: device, inFunctionName: "prelu_other", initContext: initContext)
+            }
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            if param.mode == "channel" {
+                super.init(device: device, inFunctionName: "prelu_channel_half", initContext: initContext)
+            } else if param.mode == "element" {
+                super.init(device: device, inFunctionName: "prelu_element_half", initContext: initContext)
+            } else {
+                super.init(device: device, inFunctionName: "prelu_other_half", initContext: initContext)
+            }
+        } else {
+            fatalError()
+        }
     }
     
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBuffer(param.alpha.buffer, offset: 0, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
+    func compute(commandBuffer: MTLCommandBuffer, param: PreluParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encoder is nil")
+        }
+        
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setBuffer(param.alpha.buffer, offset: 0, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
+    }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PriorBoxKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PriorBoxKernel.swift
index 15126bbc837f2997e6f693b4d6dbcfc85ba34109..cb8ef81de3176ac7fcf3e3f43a6c023a2e09eb18 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PriorBoxKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PriorBoxKernel.swift
@@ -15,136 +15,136 @@
 import Foundation
 
 struct PriorBoxMetalParam {
-  let offset: Float32
-  let stepWidth: Float32
-  let stepHeight: Float32
-  let minSize: Float32
-  let maxSize: Float32
-  let imageWidth: Float32
-  let imageHeight: Float32
-  let clip: Bool
-  let numPriors: uint
-  let aspecRatiosSize: uint
-  let minSizeSize: uint
-  let maxSizeSize: uint
+    let offset: Float32
+    let stepWidth: Float32
+    let stepHeight: Float32
+    let minSize: Float32
+    let maxSize: Float32
+    let imageWidth: Float32
+    let imageHeight: Float32
+    let clip: Bool
+    let numPriors: uint
+    let aspecRatiosSize: uint
+    let minSizeSize: uint
+    let maxSizeSize: uint
 }
 
 class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{
-  var metalParam: PriorBoxMetalParam!
-  
-  required init(device: MTLDevice, param: PriorBoxParam<P>, initContext: InitContext) {
-    
-    let originDim = param.output.tensorDim;
-    
-    param.output.tensorDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]])
-    param.output.padToFourDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]])
-    
-    param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: GlobalConfig.shared.computePrecision)
-    param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: GlobalConfig.shared.computePrecision)
-    
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.min_max_aspect_ratios_order {
-        super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder", initContext: initContext)
-      } else {
-        super.init(device: device, inFunctionName: "prior_box", initContext: initContext)
-      }
-      
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.min_max_aspect_ratios_order {
-        super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder_half", initContext: initContext)
-      } else {
-        super.init(device: device, inFunctionName: "prior_box_half", initContext: initContext)
-      }
-    } else {
-      fatalError()
-    }
-    
-    
-    guard param.minSizes.count == 1 else {
-      fatalError(" need implement ")
-    }
-    
-//    let n = 1
-//    let h = param.output.dim[1]
-//    let w = param.output.dim[2]
-//    let c = param.output.dim[3] * param.output.dim[0]
-//
-//    param.output.dim = Dim.init(inDim: [n, h, w, c])
-//    param.output.transpose = [0, 1, 2, 3]
-    
-    let imageWidth = Float32(param.inputImage.padToFourDim[3])
-    let imageHeight = Float32(param.inputImage.padToFourDim[2])
-    
-    let featureWidth = param.input.padToFourDim[3]
-    let featureHeight = param.input.padToFourDim[2]
-    
-    if param.stepW == 0 || param.stepH == 0 {
-      param.stepW = Float32(imageWidth) / Float32(featureWidth)
-      param.stepH = Float32(imageHeight) / Float32(featureHeight)
-    }
-    
-    var outputAspectRatior: [Float32] = []
-    outputAspectRatior.append(1.0)
-    
-    let epsilon = 1e-6
-    for ar in param.aspectRatios {
-      var alreadyExist = false
-      for outputAr in outputAspectRatior {
-        if fabs(Double(ar) - Double(outputAr)) < Double(epsilon) {
-          alreadyExist = true
-          break
+    var metalParam: PriorBoxMetalParam!
+    
+    required init(device: MTLDevice, param: PriorBoxParam<P>, initContext: InitContext) {
+        
+        let originDim = param.output.tensorDim;
+        
+        param.output.tensorDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]])
+        param.output.padToFourDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]])
+        
+        param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: GlobalConfig.shared.computePrecision)
+        param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: GlobalConfig.shared.computePrecision)
+        
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            if param.min_max_aspect_ratios_order {
+                super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder", initContext: initContext)
+            } else {
+                super.init(device: device, inFunctionName: "prior_box", initContext: initContext)
+            }
+            
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            if param.min_max_aspect_ratios_order {
+                super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder_half", initContext: initContext)
+            } else {
+                super.init(device: device, inFunctionName: "prior_box_half", initContext: initContext)
+            }
+        } else {
+            fatalError()
         }
-      }
-      
-      if !alreadyExist {
-        outputAspectRatior.append(ar)
-      }
-      if param.flip {
-        outputAspectRatior.append(1.0 / ar)
-      }
-    }
-    
-    if GlobalConfig.shared.computePrecision == .Float16 {
-      let buffer = device.makeBuffer(length: outputAspectRatior.count * MemoryLayout<Float16>.size)
-      float32ToFloat16(input: &outputAspectRatior, output:(buffer?.contents())!, count: outputAspectRatior.count)
-      param.newAspectRatios = buffer
-
-    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      let buffer = device.makeBuffer(bytes: outputAspectRatior, length: outputAspectRatior.count * MemoryLayout<Float32>.size, options: [])
-      param.newAspectRatios = buffer
-    } else {
-      fatalError()
+        
+        
+        guard param.minSizes.count == 1 else {
+            fatalError(" need implement ")
+        }
+        
+        //    let n = 1
+        //    let h = param.output.dim[1]
+        //    let w = param.output.dim[2]
+        //    let c = param.output.dim[3] * param.output.dim[0]
+        //
+        //    param.output.dim = Dim.init(inDim: [n, h, w, c])
+        //    param.output.transpose = [0, 1, 2, 3]
+        
+        let imageWidth = Float32(param.inputImage.padToFourDim[3])
+        let imageHeight = Float32(param.inputImage.padToFourDim[2])
+        
+        let featureWidth = param.input.padToFourDim[3]
+        let featureHeight = param.input.padToFourDim[2]
+        
+        if param.stepW == 0 || param.stepH == 0 {
+            param.stepW = Float32(imageWidth) / Float32(featureWidth)
+            param.stepH = Float32(imageHeight) / Float32(featureHeight)
+        }
+        
+        var outputAspectRatior: [Float32] = []
+        outputAspectRatior.append(1.0)
+        
+        let epsilon = 1e-6
+        for ar in param.aspectRatios {
+            var alreadyExist = false
+            for outputAr in outputAspectRatior {
+                if fabs(Double(ar) - Double(outputAr)) < Double(epsilon) {
+                    alreadyExist = true
+                    break
+                }
+            }
+            
+            if !alreadyExist {
+                outputAspectRatior.append(ar)
+            }
+            if param.flip {
+                outputAspectRatior.append(1.0 / ar)
+            }
+        }
+        
+        if GlobalConfig.shared.computePrecision == .Float16 {
+            let buffer = device.makeBuffer(length: outputAspectRatior.count * MemoryLayout<Float16>.size)
+            float32ToFloat16(input: &outputAspectRatior, output:(buffer?.contents())!, count: outputAspectRatior.count)
+            param.newAspectRatios = buffer
+            
+        } else if GlobalConfig.shared.computePrecision == .Float32 {
+            let buffer = device.makeBuffer(bytes: outputAspectRatior, length: outputAspectRatior.count * MemoryLayout<Float32>.size, options: [])
+            param.newAspectRatios = buffer
+        } else {
+            fatalError()
+        }
+        
+        let aspectRatiosSize = uint(outputAspectRatior.count)
+        
+        let maxSizeSize: uint = uint(param.maxSizes.count)
+        let minSizeSize: uint = uint(param.minSizes.count)
+        
+        let numPriors = aspectRatiosSize * minSizeSize + maxSizeSize
+        
+        let minSize = param.minSizes.last ?? 0.0
+        let maxSize = param.maxSizes.last ?? 0.0
+        
+        metalParam = PriorBoxMetalParam.init(offset: param.offset, stepWidth: param.stepW, stepHeight: param.stepH, minSize: minSize, maxSize: maxSize, imageWidth: imageWidth, imageHeight: imageHeight, clip: param.clip, numPriors: numPriors, aspecRatiosSize: aspectRatiosSize, minSizeSize: minSizeSize, maxSizeSize: maxSizeSize)
+        
     }
     
-    let aspectRatiosSize = uint(outputAspectRatior.count)
-    
-    let maxSizeSize: uint = uint(param.maxSizes.count)
-    let minSizeSize: uint = uint(param.minSizes.count)
-    
-    let numPriors = aspectRatiosSize * minSizeSize + maxSizeSize
-    
-    let minSize = param.minSizes.last ?? 0.0
-    let maxSize = param.maxSizes.last ?? 0.0
-    
-    metalParam = PriorBoxMetalParam.init(offset: param.offset, stepWidth: param.stepW, stepHeight: param.stepH, minSize: minSize, maxSize: maxSize, imageWidth: imageWidth, imageHeight: imageHeight, clip: param.clip, numPriors: numPriors, aspecRatiosSize: aspectRatiosSize, minSizeSize: minSizeSize, maxSizeSize: maxSizeSize)
-    
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: PriorBoxParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    func compute(commandBuffer: MTLCommandBuffer, param: PriorBoxParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setTexture(param.outputVariances.metalTexture, index: 2)
+        
+        encoder.setBuffer(param.newAspectRatios!, offset: 0, index: 0)
+        
+        encoder.setBytes(&metalParam, length: MemoryLayout<PriorBoxMetalParam>.size, index: 1)
+        
+        encoder.setBytes(param.variances, length: MemoryLayout<Float32>.size * param.variances.count, index: 2)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
-    
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setTexture(param.outputVariances.metalTexture, index: 2)
-    
-    encoder.setBuffer(param.newAspectRatios!, offset: 0, index: 0)
-    
-    encoder.setBytes(&metalParam, length: MemoryLayout<PriorBoxMetalParam>.size, index: 1)
-    
-    encoder.setBytes(param.variances, length: MemoryLayout<Float32>.size * param.variances.count, index: 2)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift
index 0bde0623ef53dd8346fbc2f91843e06ed01c77d7..06ff7d39902602dbb6bf3639192a12a582ef9550 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift
@@ -15,23 +15,23 @@
 import Foundation
 
 class ReluKernel<P: PrecisionType>: Kernel, Computable{
-  func compute(commandBuffer: MTLCommandBuffer, param: ReluParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    func compute(commandBuffer: MTLCommandBuffer, param: ReluParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-  
-  required init(device: MTLDevice, param: ReluParam<P>, initContext: InitContext) {
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "relu", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "relu_half", initContext: initContext)
-    } else {
-      fatalError()
+    
+    required init(device: MTLDevice, param: ReluParam<P>, initContext: InitContext) {
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "relu", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "relu_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
     }
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReshapeKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReshapeKernel.swift
index f14db86a3a4904575be6ac1f0c70c36f99ce4305..954eff9a568ec7e7a816aa508d3f0415c5857503 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReshapeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReshapeKernel.swift
@@ -15,83 +15,83 @@
 import Foundation
 
 struct ReshapeMetalParam {
-  var idim: (Int32, Int32, Int32, Int32)
-  var itrans: (Int32, Int32, Int32, Int32)
-  var odim: (Int32, Int32, Int32, Int32)
-  var otrans: (Int32, Int32, Int32, Int32)
+    var idim: (Int32, Int32, Int32, Int32)
+    var itrans: (Int32, Int32, Int32, Int32)
+    var odim: (Int32, Int32, Int32, Int32)
+    var otrans: (Int32, Int32, Int32, Int32)
 }
 
 struct ReshapeTestParam: TestParam {
-  let inputTexture: MTLTexture
-  let outputTexture: MTLTexture
-  let param: ReshapeMetalParam
+    let inputTexture: MTLTexture
+    let outputTexture: MTLTexture
+    let param: ReshapeMetalParam
 }
 
 class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
-  
-  var metalParam: ReshapeMetalParam
-  
-  required init(device: MTLDevice, param: ReshapeParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
-    var id: [Int32] = [1, 1, 1, 1]
-    for i in 0..<param.input.tensorDim.cout() {
-      id[4-param.input.tensorDim.cout()+i] = Int32(param.input.tensorDim[i])
+    
+    var metalParam: ReshapeMetalParam
+    
+    required init(device: MTLDevice, param: ReshapeParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
+        var id: [Int32] = [1, 1, 1, 1]
+        for i in 0..<param.input.tensorDim.cout() {
+            id[4-param.input.tensorDim.cout()+i] = Int32(param.input.tensorDim[i])
+        }
+        let it: [Int32] = param.input.transpose.map { Int32($0) }
+        var od: [Int32] = [1, 1, 1, 1]
+        for i in 0..<param.output.tensorDim.cout() {
+            od[4-param.output.tensorDim.cout()+i] = Int32(param.output.tensorDim[i])
+        }
+        let ot: [Int32] = param.output.transpose.map { Int32($0) }
+        metalParam = ReshapeMetalParam.init(
+            idim: (id[0], id[1], id[2], id[3]),
+            itrans: (it[0], it[1], it[2], it[3]),
+            odim: (od[0], od[1], od[2], od[3]),
+            otrans: (ot[0], ot[1], ot[2], ot[3])
+        )
+        let irank = param.input.tensorDim.cout()
+        let orank = param.output.tensorDim.cout()
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_float", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
     }
-    let it: [Int32] = param.input.transpose.map { Int32($0) }
-    var od: [Int32] = [1, 1, 1, 1]
-    for i in 0..<param.output.tensorDim.cout() {
-      od[4-param.output.tensorDim.cout()+i] = Int32(param.output.tensorDim[i])
+    
+    required init(device: MTLDevice, testParam: ReshapeTestParam, initContext: InitContext) {
+        metalParam = ReshapeMetalParam.init(
+            idim: (0, 0, 0, 0),
+            itrans: (0, 0, 0, 0),
+            odim: (0, 0, 0, 0),
+            otrans: (0, 0, 0, 0)
+        )
+        super.init(device: device, inFunctionName: "reshape", initContext: initContext)
     }
-    let ot: [Int32] = param.output.transpose.map { Int32($0) }
-    metalParam = ReshapeMetalParam.init(
-      idim: (id[0], id[1], id[2], id[3]),
-      itrans: (it[0], it[1], it[2], it[3]),
-      odim: (od[0], od[1], od[2], od[3]),
-      otrans: (ot[0], ot[1], ot[2], ot[3])
-    )
-    let irank = param.input.tensorDim.cout()
-    let orank = param.output.tensorDim.cout()
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_float", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_half", initContext: initContext)
-    } else {
-      fatalError()
+    
+    func compute(commandBuffer: MTLCommandBuffer, param: ReshapeParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encoder is nil")
+        }
+        
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        
+        encoder.setBytes(&metalParam, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
-  }
-  
-  required init(device: MTLDevice, testParam: ReshapeTestParam, initContext: InitContext) {
-    metalParam = ReshapeMetalParam.init(
-    idim: (0, 0, 0, 0),
-    itrans: (0, 0, 0, 0),
-    odim: (0, 0, 0, 0),
-    otrans: (0, 0, 0, 0)
-    )
-    super.init(device: device, inFunctionName: "reshape", initContext: initContext)
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ReshapeParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encoder is nil")
-    }
-
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-
-    encoder.setBytes(&metalParam, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-  
-//  func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) {
-//    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-//      fatalError()
-//    }
-//    encoder.setTexture(testParam.inputTexture, index: 0)
-//    encoder.setTexture(testParam.outputTexture, index: 1)
-//    var pm: ReshapeMetalParam = testParam.param
-//    encoder.setBytes(&pm, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
-//    encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture)
-//    encoder.endEncoding()
-//  }
+    
+    //  func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) {
+    //    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+    //      fatalError()
+    //    }
+    //    encoder.setTexture(testParam.inputTexture, index: 0)
+    //    encoder.setTexture(testParam.outputTexture, index: 1)
+    //    var pm: ReshapeMetalParam = testParam.param
+    //    encoder.setBytes(&pm, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
+    //    encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture)
+    //    encoder.endEncoding()
+    //  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ResizeBilinearKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ResizeBilinearKernel.swift
index a007196b6735de29f7de6a8ff28935baf4477a5f..7e9105ae57ed59ceab3ace5c64cdfd97ba029a1f 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ResizeBilinearKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ResizeBilinearKernel.swift
@@ -15,37 +15,37 @@
 import Foundation
 
 struct ResizeBilinearMetalParam {
-  var ratio_h: Float32
-  var ratio_w: Float32
+    var ratio_h: Float32
+    var ratio_w: Float32
 }
 
 class ResizeBilinearKernel<P: PrecisionType>: Kernel, Computable{
-  required init(device: MTLDevice, param: ResizeBilinearParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "resize_bilinear", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "resize_bilinear_half", initContext: initContext)
-    } else {
-      fatalError()
+    required init(device: MTLDevice, param: ResizeBilinearParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "resize_bilinear", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "resize_bilinear_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
     }
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: ResizeBilinearParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    
+    func compute(commandBuffer: MTLCommandBuffer, param: ResizeBilinearParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        // Bind input/output textures, then pass the input/output extent ratios along dims[2] and dims[3].
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        let ratio_h: Float32 = Float32(param.input.tensorDim.dims[2]) / Float32(param.output.tensorDim.dims[2])
+        let ratio_w: Float32 = Float32(param.input.tensorDim.dims[3]) / Float32(param.output.tensorDim.dims[3])
+        var p = ResizeBilinearMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w)
+        encoder.setBytes(&p, length: MemoryLayout<ResizeBilinearMetalParam>.size, index: 0)  // length must match the struct actually passed
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
     
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    let ratio_h: Float32 = Float32(param.input.tensorDim.dims[2]) / Float32(param.output.tensorDim.dims[2])
-    let ratio_w: Float32 = Float32(param.input.tensorDim.dims[3]) / Float32(param.output.tensorDim.dims[3])
-    var p = ResizeBilinearMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w)
-    encoder.setBytes(&p, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-  
-
-  
+    
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift
index 2afee5607d3c67e9b125c436affbb9afa4ed2c5a..4a6a9a3ee4b2558fbf2a7442717400ef19c22b33 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift
@@ -15,14 +15,14 @@
 import Foundation
 
 class ScaleKernel: CusomKernel {
-  init(device: MTLDevice, shape: Shape, metalLoadMode: MetalLoadMode, metalLibPath: String?) {
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "scale", outputDim: shape, metalLoadModel: metalLoadMode, metalLibPath: metalLibPath)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "scale_half", outputDim: shape, metalLoadModel: metalLoadMode, metalLibPath: metalLibPath)
-    } else {
-      fatalError(" unsupport ")
+    init(device: MTLDevice, shape: Shape, metalLoadMode: MetalLoadMode, metalLibPath: String?) {
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "scale", outputDim: shape, metalLoadModel: metalLoadMode, metalLibPath: metalLibPath)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "scale_half", outputDim: shape, metalLoadModel: metalLoadMode, metalLibPath: metalLibPath)
+        } else {
+            fatalError(" unsupport ")
+        }
     }
-  }
 }
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ShapeKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ShapeKernel.swift
index dfec8f9adf9dfc6be3a835c8ea215b37cb1a948c..1d2b80cae412ce34415f6c678f0912bbf29e1bb2 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ShapeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ShapeKernel.swift
@@ -19,24 +19,24 @@ struct ShapeMetalParam {
 }
 
 class ShapeKernel<P: PrecisionType>: Kernel, Computable{
-  func compute(commandBuffer: MTLCommandBuffer, param: ShapeParam<P>) throws {
-//    print("shape compute")
-//    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-//      throw PaddleMobileError.predictError(message: " encode is nil")
-//    }
-//    encoder.setTexture(param.output.metalTexture, index: 0)
-//    encoder.endEncoding()
-  }
-  
-  required init(device: MTLDevice, param: ShapeParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "shape", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "shape_half", initContext: initContext)
-    } else {
-      fatalError()
+    func compute(commandBuffer: MTLCommandBuffer, param: ShapeParam<P>) throws {
+        //    print("shape compute")
+        //    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+        //      throw PaddleMobileError.predictError(message: " encode is nil")
+        //    }
+        //    encoder.setTexture(param.output.metalTexture, index: 0)
+        //    encoder.endEncoding()
     }
-  }
-  
+    
+    required init(device: MTLDevice, param: ShapeParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "shape", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "shape_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
+    }
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SoftmaxKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SoftmaxKernel.swift
index 1eac43484d8759e2d1aefaef4b55fbde728a24d6..b4f3281425474eba90cbb3ac6e262cf06f3eeb2d 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SoftmaxKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SoftmaxKernel.swift
@@ -15,37 +15,37 @@
 import Foundation
 
 struct SoftmaxMetalParam {
-  let N: Int32
-  let K: Int32
+    let N: Int32
+    let K: Int32
 }
 
 class SoftmaxKernel<P: PrecisionType>: Kernel, Computable{
-  
-  var metalParam: SoftmaxMetalParam
-  required init(device: MTLDevice, param: SoftmaxParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
-    metalParam = SoftmaxMetalParam.init(
-      N: Int32(param.input.tensorDim[0]),
-      K: Int32(param.input.tensorDim[1])
-    )
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "softmax_float", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "softmax_half", initContext: initContext)
-    } else {
-      fatalError()
+    
+    var metalParam: SoftmaxMetalParam
+    required init(device: MTLDevice, param: SoftmaxParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
+        metalParam = SoftmaxMetalParam.init(
+            N: Int32(param.input.tensorDim[0]),
+            K: Int32(param.input.tensorDim[1])
+        )
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "softmax_float", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "softmax_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
     }
-  }
-
-  func compute(commandBuffer: MTLCommandBuffer, param: SoftmaxParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encoder is nil")
+    
+    func compute(commandBuffer: MTLCommandBuffer, param: SoftmaxParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encoder is nil")
+        }
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setBytes(&metalParam, length: MemoryLayout<SoftmaxMetalParam>.size, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<SoftmaxMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-  
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SplitKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SplitKernel.swift
index 8b07a87406a6c33767ac6552c0f8241602a89cb0..d15e37296290980a3a7092b7e52a82d33e1044a9 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SplitKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SplitKernel.swift
@@ -15,79 +15,79 @@
 import Foundation
 
 struct SplitMetalParam {
-  var idim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1)
-  var axis: Int32 = 0
-  var offset: Int32 = 0
-  var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
-  var vdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
+    var idim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1)
+    var axis: Int32 = 0
+    var offset: Int32 = 0
+    var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
+    var vdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
 }
 
 class SplitKernel<P: PrecisionType>: Kernel, Computable{
-  var smp: SplitMetalParam
-  func compute(commandBuffer: MTLCommandBuffer, param: SplitParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    var smp: SplitMetalParam
+    func compute(commandBuffer: MTLCommandBuffer, param: SplitParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        for i in 0..<param.outputList.count {
+            encoder.setTexture(param.outputList[i].metalTexture, index: i + 1)
+        }
+        encoder.setBytes(&smp, length: MemoryLayout<SplitMetalParam>.size, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture)
+        encoder.endEncoding()
     }
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    for i in 0..<param.outputList.count {
-      encoder.setTexture(param.outputList[i].metalTexture, index: i + 1)
+    
+    required init(device: MTLDevice, param: SplitParam<P>, initContext: InitContext) {
+        //     param.output.initTexture(device: device, computePrecision: computePrecision)
+        let num = param.outputList.count
+        let rank = param.input.tensorDim.cout()
+        assert(num >= 2 && num <= 4)
+        for output in param.outputList {
+            output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+        }
+        smp = SplitMetalParam.init()
+        smp.idim = (Int32(param.input.dim[0]), Int32(param.input.dim[1]), Int32(param.input.dim[2]), Int32(param.input.dim[3]))
+        smp.axis = Int32(param.axis + param.input.dim.cout() - param.input.tensorDim.cout())
+        for i in 0..<4 {
+            if param.input.transpose[i] == smp.axis {
+                smp.axis = Int32(i)
+                break
+            }
+        }
+        smp.trans = (Int32(param.input.transpose[0]), Int32(param.input.transpose[1]), Int32(param.input.transpose[2]), Int32(param.input.transpose[3]))
+        var vdim: [Int32] = [0, 0, 0, 0]
+        for i in 0..<num {
+            vdim[i] = Int32(param.outputList[i].tensorDim[param.axis])
+        }
+        smp.vdim = (vdim[0], vdim[1], vdim[2], vdim[3])
+        var v = "normal"
+        if rank == 4 {
+            if smp.axis == 1 {
+                v = "y"
+            } else if smp.axis == 2 {
+                v = "x"
+            }
+        } else if rank == 3 {
+            if smp.axis == 2 {
+                v = "y"
+            } else if smp.axis == 3 {
+                v = "x"
+            }
+        } else if rank == 2 {
+            if smp.axis == 2 {
+                v = "y"
+            }
+        }
+        if v == "normal" {
+            fatalError("split unsupported")
+        }
+        if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_float", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_half", initContext: initContext)
+        } else {
+            fatalError()
+        }
     }
-    encoder.setBytes(&smp, length: MemoryLayout<SplitMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture)
-    encoder.endEncoding()
-  }
-  
-  required init(device: MTLDevice, param: SplitParam<P>, initContext: InitContext) {
-    //     param.output.initTexture(device: device, computePrecision: computePrecision)
-    let num = param.outputList.count
-    let rank = param.input.tensorDim.cout()
-    assert(num >= 2 && num <= 4)
-    for output in param.outputList {
-      output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    }
-    smp = SplitMetalParam.init()
-    smp.idim = (Int32(param.input.dim[0]), Int32(param.input.dim[1]), Int32(param.input.dim[2]), Int32(param.input.dim[3]))
-    smp.axis = Int32(param.axis + param.input.dim.cout() - param.input.tensorDim.cout())
-    for i in 0..<4 {
-      if param.input.transpose[i] == smp.axis {
-        smp.axis = Int32(i)
-        break
-      }
-    }
-    smp.trans = (Int32(param.input.transpose[0]), Int32(param.input.transpose[1]), Int32(param.input.transpose[2]), Int32(param.input.transpose[3]))
-    var vdim: [Int32] = [0, 0, 0, 0]
-    for i in 0..<num {
-      vdim[i] = Int32(param.outputList[i].tensorDim[param.axis])
-    }
-    smp.vdim = (vdim[0], vdim[1], vdim[2], vdim[3])
-    var v = "normal"
-    if rank == 4 {
-      if smp.axis == 1 {
-        v = "y"
-      } else if smp.axis == 2 {
-        v = "x"
-      }
-    } else if rank == 3 {
-      if smp.axis == 2 {
-        v = "y"
-      } else if smp.axis == 3 {
-        v = "x"
-      }
-    } else if rank == 2 {
-      if smp.axis == 2 {
-        v = "y"
-      }
-    }
-    if v == "normal" {
-      fatalError("split unsupported")
-    }
-    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_float", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_half", initContext: initContext)
-    } else {
-      fatalError()
-    }
-  }
-  
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Texture2DTo2DArrayKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Texture2DTo2DArrayKernel.swift
index fd3ba24776ff02cd79d9d9c825bc211b2877bbf0..58b3db8d8666fd4321eb5554541dca66cf5b6397 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Texture2DTo2DArrayKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Texture2DTo2DArrayKernel.swift
@@ -17,31 +17,31 @@ import MetalKit
 import CoreMedia
 
 struct Texture2DTo2DArrayParam {
-  let input: MTLTexture
-  let output: MTLTexture
-  let expectDim: Dim
+    let input: MTLTexture
+    let output: MTLTexture
+    let expectDim: Dim
 }
 
 class Texture2DTo2DArrayKernel<P: PrecisionType>: Kernel, Computable{
-  func compute(commandBuffer: MTLCommandBuffer, param: FeedParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
-    }
-    encoder.setTexture(param.input.mtlTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.dispatch(computePipline: pipline, outTexture: param.input.mtlTexture)
-    encoder.endEncoding()
-  }
-  
-  required init(device: MTLDevice, param: FeedParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-    if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "texture2d_to_2d_array_half", initContext: initContext)
-    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "texture2d_to_2d_array", initContext: initContext)
-    } else {
-      fatalError()
+    func compute(commandBuffer: MTLCommandBuffer, param: FeedParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        encoder.setTexture(param.input.mtlTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.dispatch(computePipline: pipline, outTexture: param.input.mtlTexture)
+        encoder.endEncoding()
     }
     
-  }
+    required init(device: MTLDevice, param: FeedParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
+        if GlobalConfig.shared.computePrecision == .Float16 {
+            super.init(device: device, inFunctionName: "texture2d_to_2d_array_half", initContext: initContext)
+        } else if GlobalConfig.shared.computePrecision == .Float32 {
+            super.init(device: device, inFunctionName: "texture2d_to_2d_array", initContext: initContext)
+        } else {
+            fatalError()
+        }
+        
+    }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/TransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/TransposeKernel.swift
index e1490052e7419591c57ad8cdf628708fd15beeb8..92947dc27845986c43d099f58c147c1f4c86a4ef 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/TransposeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/TransposeKernel.swift
@@ -15,65 +15,65 @@
 import Foundation
 
 struct TransposeMetalParam {
-  var iC: Int32 = 0
-  var oC: Int32 = 0
-  var axis: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
+    var iC: Int32 = 0
+    var oC: Int32 = 0
+    var axis: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
 }
 
 class TransposeKernel<P: PrecisionType>: Kernel, Computable {
-  var metalParam: TransposeMetalParam = TransposeMetalParam.init()
-  required init(device: MTLDevice, param: TransposeParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
-    let rank = param.input.tensorDim.cout()
-    var axis: [Int] = [0, 1, 2, 3]
-    for i in 0..<param.axis.count {
-      axis[4-rank+i] = 4 - rank + Int(param.axis[i])
-    }
-
-    var naxis: [Int] = [0, 0, 0, 0]
-    for i in 0..<4 {
-      for j in 0..<4 {
-        if param.input.transpose[j] == axis[i] {
-          naxis[i] = j
-          break
+    var metalParam: TransposeMetalParam = TransposeMetalParam.init()
+    required init(device: MTLDevice, param: TransposeParam<P>, initContext: InitContext) {
+        param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
+        let rank = param.input.tensorDim.cout()
+        var axis: [Int] = [0, 1, 2, 3]
+        for i in 0..<param.axis.count {
+            axis[4-rank+i] = 4 - rank + Int(param.axis[i])
         }
-      }
-    }
-    metalParam.iC = Int32(param.input.dim[param.input.transpose[3]])
-    metalParam.oC = Int32(param.output.dim[3])
-    metalParam.axis = (Int32(naxis[0]), Int32(naxis[1]), Int32(naxis[2]), Int32(naxis[3]))
-    var kernelFunc = "transpose_undefined"
-    if GlobalConfig.shared.computePrecision == .Float16 {
-      if param.input.transpose == axis {
-        kernelFunc = "transpose_copy_half"
-      } else {
-        kernelFunc = "transpose_\(rank)_half"
-      }
-    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      if param.input.transpose == axis {
-        kernelFunc = "transpose_copy_float"
-      } else {
-        kernelFunc = "transpose_\(rank)_float"
-      }
-    } else {
-      fatalError()
+        
+        var naxis: [Int] = [0, 0, 0, 0]
+        for i in 0..<4 {
+            for j in 0..<4 {
+                if param.input.transpose[j] == axis[i] {
+                    naxis[i] = j
+                    break
+                }
+            }
+        }
+        metalParam.iC = Int32(param.input.dim[param.input.transpose[3]])
+        metalParam.oC = Int32(param.output.dim[3])
+        metalParam.axis = (Int32(naxis[0]), Int32(naxis[1]), Int32(naxis[2]), Int32(naxis[3]))
+        var kernelFunc = "transpose_undefined"
+        if GlobalConfig.shared.computePrecision == .Float16 {
+            if param.input.transpose == axis {
+                kernelFunc = "transpose_copy_half"
+            } else {
+                kernelFunc = "transpose_\(rank)_half"
+            }
+        } else if GlobalConfig.shared.computePrecision == .Float32 {
+            if param.input.transpose == axis {
+                kernelFunc = "transpose_copy_float"
+            } else {
+                kernelFunc = "transpose_\(rank)_float"
+            }
+        } else {
+            fatalError()
+        }
+        print("===========>", kernelFunc)
+        print(metalParam)
+        super.init(device: device, inFunctionName: kernelFunc, initContext: initContext)
     }
-    print("===========>", kernelFunc)
-    print(metalParam)
-    super.init(device: device, inFunctionName: kernelFunc, initContext: initContext)
-  }
-  
-  func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    
+    func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setBytes(&metalParam, length: MemoryLayout<TransposeMetalParam>.size, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
-  
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setTexture(param.output.metalTexture, index: 1)
-    encoder.setBytes(&metalParam, length: MemoryLayout<TransposeMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
-  }
-  
-
+    
+    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/MulticlassNMSOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/MulticlassNMSOp.swift
index 6d2e46b64986300556898596ea881a254709f472..b438b3c46cf061d48837646d1678c150223c4673 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/MulticlassNMSOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/MulticlassNMSOp.swift
@@ -15,57 +15,57 @@
 import Foundation
 
 class MulticlassNMSParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      scores = try MulticlassNMSParam.getFirstTensor(key: "Scores", map: opDesc.inputs, from: inScope)
-      bboxes = try MulticlassNMSParam.getFirstTensor(key: "BBoxes", map: opDesc.inputs, from: inScope)
-      output = try MulticlassNMSParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      
-      middleOutput = FetchHolder.init(inPaddedCapacity: scores.tensorDim.numel(), inDim: scores.tensorDim)
-      
-      bboxOutput = FetchHolder.init(inPaddedCapacity: bboxes.tensorDim.numel(), inDim: bboxes.tensorDim)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            scores = try MulticlassNMSParam.getFirstTensor(key: "Scores", map: opDesc.inputs, from: inScope)
+            bboxes = try MulticlassNMSParam.getFirstTensor(key: "BBoxes", map: opDesc.inputs, from: inScope)
+            output = try MulticlassNMSParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            
+            middleOutput = FetchHolder.init(inPaddedCapacity: scores.tensorDim.numel(), inDim: scores.tensorDim)
+            
+            bboxOutput = FetchHolder.init(inPaddedCapacity: bboxes.tensorDim.numel(), inDim: bboxes.tensorDim)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  var bboxOutput: FetchHolder
-  var middleOutput: FetchHolder
-  let scores: Texture
-  let bboxes: Texture
-  var output: Texture
+    var bboxOutput: FetchHolder
+    var middleOutput: FetchHolder
+    let scores: Texture
+    let bboxes: Texture
+    var output: Texture
 }
 
 class MulticlassNMSOp<P: PrecisionType>: Operator<MulticlassNMSKernel<P>, MulticlassNMSParam<P>>, Runable, Creator, InferShaperable{
-
-  func inputVariant() -> [String : [MTLBuffer]] {
-    guard let scoreBuffer = para.middleOutput.resultBuffer, let bboxBuffer = para.middleOutput.resultBuffer else {
-      fatalError()
+    
+    func inputVariant() -> [String : [MTLBuffer]] {
+        guard let scoreBuffer = para.middleOutput.resultBuffer, let bboxBuffer = para.bboxOutput.resultBuffer else {
+            fatalError()  // one of the fetch holders has no result buffer
+        }
+        return ["Scores" : [scoreBuffer], "BBoxes" : [bboxBuffer]]  // scores come from middleOutput, boxes from bboxOutput
     }
-    return ["Scores" : [scoreBuffer], "BBoxes" : [bboxBuffer]]
-  }
-  
-  func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let _ {
-      fatalError()
+    
+    func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch {
+            fatalError("\(error)")  // still aborts, but keeps the kernel's error in the crash message
+        }
+    }
+    
+    func inferShape() {
+        // para.output.dim = para.input.dim
+    }
+    
+    typealias OpType =  MulticlassNMSOp<P>
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        
+    }
+    
+    func delogOutput() {
+        print(" nms - output: ")
+        print(para.bboxes.metalTexture.float32Array().strideArray())
     }
-  }
-  
-  func inferShape() {
-    // para.output.dim = para.input.dim
-  }
-  
-  typealias OpType =  MulticlassNMSOp<P>
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-
-  }
-  
-  func delogOutput() {
-    print(" nms - output: ")
-    print(para.bboxes.metalTexture.float32Array().strideArray())
-  }
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/PoolOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/PoolOp.swift
index e57c8f48e362af8cae8fedbb5a0292775f0ce923..8b212f3b1d0ea616881eef3f35dc4bc1eb2f02f2 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/PoolOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/PoolOp.swift
@@ -15,60 +15,60 @@
 import Foundation
 
 class PoolParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try PoolParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try PoolParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      poolType = try PoolParam.getAttr(key: "pooling_type", attrs: opDesc.attrs)
-      ksize = try PoolParam.getAttr(key: "ksize", attrs: opDesc.attrs)
-      stride = try PoolParam.getAttr(key: "strides", attrs: opDesc.attrs)
-      padding = try PoolParam.getAttr(key: "paddings", attrs: opDesc.attrs)
-      ceilMode = try PoolParam.getAttr(key: "ceil_mode", attrs: opDesc.attrs)
-      globalPooling = try PoolParam.getAttr(key: "global_pooling", attrs: opDesc.attrs)
-      assert(input.transpose == [0, 2, 3, 1])
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            input = try PoolParam.inputX(inputs: opDesc.inputs, from: inScope)
+            output = try PoolParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            poolType = try PoolParam.getAttr(key: "pooling_type", attrs: opDesc.attrs)
+            ksize = try PoolParam.getAttr(key: "ksize", attrs: opDesc.attrs)
+            stride = try PoolParam.getAttr(key: "strides", attrs: opDesc.attrs)
+            padding = try PoolParam.getAttr(key: "paddings", attrs: opDesc.attrs)
+            ceilMode = try PoolParam.getAttr(key: "ceil_mode", attrs: opDesc.attrs)
+            globalPooling = try PoolParam.getAttr(key: "global_pooling", attrs: opDesc.attrs)
+            assert(input.transpose == [0, 2, 3, 1])
+        } catch let error {
+            throw error
+        }
+        //        let buffer = input.metalTexture.buffer.contents().assumingMemoryBound(to: P.self)
     }
-    //        let buffer = input.metalTexture.buffer.contents().assumingMemoryBound(to: P.self)
-  }
-  let input: Texture
-  var output: Texture
-  var ksize: [Int32]
-  var stride: [Int32]
-  var padding: [Int32]
-  var poolType: String
-  var ceilMode: Bool
-  var globalPooling: Bool
+    let input: Texture
+    var output: Texture
+    var ksize: [Int32]
+    var stride: [Int32]
+    var padding: [Int32]
+    var poolType: String
+    var ceilMode: Bool
+    var globalPooling: Bool
 }
 
 class PoolOp<P: PrecisionType>: Operator<PoolKernel<P>, PoolParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = PoolOp<P>
-
-  func inferShape() {
-    // para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = PoolOp<P>
+    
+    func inferShape() {
+        // para.output.dim = para.input.dim
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
-
     
-//    print("pool2d delog")
-//    let _: P? = para.input.metalTexture.logDesc(header: "pool2d input: ", stridable: true)
-//    print(para.ksize)
-//    print(para.stride)
-//    print(para.padding)
-//    print(para.poolType)
-//    let _: P? = para.output.metalTexture.logDesc(header: "pool2d output: ", stridable: true)
-  }
+    /// Runs the pool op by calling `kernel.compute` with `buffer` and `para`.
+    ///
+    /// `device` is not used here. Errors thrown by `kernel.compute` propagate
+    /// to the caller as-is, so no catch-and-rethrow wrapper is needed.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+        
+        
+        //    print("pool2d delog")
+        //    let _: P? = para.input.metalTexture.logDesc(header: "pool2d input: ", stridable: true)
+        //    print(para.ksize)
+        //    print(para.stride)
+        //    print(para.padding)
+        //    print(para.poolType)
+        //    let _: P? = para.output.metalTexture.logDesc(header: "pool2d output: ", stridable: true)
+    }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/PreluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/PreluOp.swift
index b7150c2fea85b7a6da6ae883e95c751484db6af6..09a6b027e38b640e355357f244f0e6150e0d95d5 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/PreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/PreluOp.swift
@@ -15,51 +15,51 @@
 import Foundation
 
 class PreluParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try PreluParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try PreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      alpha = try PreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
-      mode = try PreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            input = try PreluParam.inputX(inputs: opDesc.inputs, from: inScope)
+            output = try PreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            alpha = try PreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
+            mode = try PreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  let mode: String
-  let alpha: Tensor<P>
-  let input: Texture
-  var output: Texture
+    let mode: String
+    let alpha: Tensor<P>
+    let input: Texture
+    var output: Texture
 }
 
 class PreluOp<P: PrecisionType>: Operator<PreluKernel<P>, PreluParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = PreluOp<P>
-
-  func inferShape() {
-    // para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = PreluOp<P>
+    
+    func inferShape() {
+        // para.output.dim = para.input.dim
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) input: ")
-    print(para.input.metalTexture.toTensor(dim: (n: para.input.padToFourDim[0], c: para.input.padToFourDim[1], h: para.input.padToFourDim[2], w: para.input.padToFourDim[3])).strideArray())
     
-    print(" \(type) Alpha: ")
-    let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false)
+    /// Runs the prelu op by calling `kernel.compute` with `buffer` and `para`.
+    ///
+    /// `device` is not used here. Errors thrown by `kernel.compute` propagate
+    /// to the caller as-is, so no catch-and-rethrow wrapper is needed.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+    
+    func delogOutput() {
+        print(" \(type) input: ")
+        print(para.input.metalTexture.toTensor(dim: (n: para.input.padToFourDim[0], c: para.input.padToFourDim[1], h: para.input.padToFourDim[2], w: para.input.padToFourDim[3])).strideArray())
+        
+        print(" \(type) Alpha: ")
+        let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false)
+        
+        print(" \(type) output: ")
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
+    }
     
-    print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
-  }
-  
-//    print("softmax delog")
-//    let _: P? = para.input.metalTexture.logDesc(header: "softmax input: ", stridable: false)
-//    let _: P? = para.output.metalTexture.logDesc(header: "softmax output: ", stridable: false)
+    //    print("softmax delog")
+    //    let _: P? = para.input.metalTexture.logDesc(header: "softmax input: ", stridable: false)
+    //    let _: P? = para.output.metalTexture.logDesc(header: "softmax output: ", stridable: false)
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/PriorBoxOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/PriorBoxOp.swift
index bff7c9870a3dc70e820b02ad775ca8a19527c26d..80774f22a9fa27fa97165636b3792785d3146add 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/PriorBoxOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/PriorBoxOp.swift
@@ -15,109 +15,109 @@
 import Foundation
 
 class PriorBoxParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      min_max_aspect_ratios_order = try PriorBoxParam.getAttr(key: "min_max_aspect_ratios_order", attrs: opDesc.attrs)
-    } catch _ {
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            min_max_aspect_ratios_order = try PriorBoxParam.getAttr(key: "min_max_aspect_ratios_order", attrs: opDesc.attrs)
+        } catch _ {
+        }
+        
+        do {
+            input = try PriorBoxParam.input(inputs: opDesc.inputs, from: inScope)
+            output = try PriorBoxParam.outputBoxes(outputs: opDesc.outputs, from: inScope)
+            inputImage = try PriorBoxParam.inputImage(inputs: opDesc.inputs, from: inScope)
+            outputVariances = try PriorBoxParam.outputVariances(outputs: opDesc.outputs, from: inScope)
+            minSizes = try PriorBoxParam.getAttr(key: "min_sizes", attrs: opDesc.attrs)
+            maxSizes = try PriorBoxParam.getAttr(key: "max_sizes", attrs: opDesc.attrs)
+            aspectRatios = try PriorBoxParam.getAttr(key: "aspect_ratios", attrs: opDesc.attrs)
+            variances = try PriorBoxParam.getAttr(key: "variances", attrs: opDesc.attrs)
+            flip = try PriorBoxParam.getAttr(key: "flip", attrs: opDesc.attrs)
+            clip = try PriorBoxParam.getAttr(key: "clip", attrs: opDesc.attrs)
+            stepW = try PriorBoxParam.getAttr(key: "step_w", attrs: opDesc.attrs)
+            stepH = try PriorBoxParam.getAttr(key: "step_h", attrs: opDesc.attrs)
+            offset = try PriorBoxParam.getAttr(key: "offset", attrs: opDesc.attrs)
+        } catch let error {
+            throw error
+        }
     }
     
-    do {
-      input = try PriorBoxParam.input(inputs: opDesc.inputs, from: inScope)
-      output = try PriorBoxParam.outputBoxes(outputs: opDesc.outputs, from: inScope)
-      inputImage = try PriorBoxParam.inputImage(inputs: opDesc.inputs, from: inScope)
-      outputVariances = try PriorBoxParam.outputVariances(outputs: opDesc.outputs, from: inScope)
-      minSizes = try PriorBoxParam.getAttr(key: "min_sizes", attrs: opDesc.attrs)
-      maxSizes = try PriorBoxParam.getAttr(key: "max_sizes", attrs: opDesc.attrs)
-      aspectRatios = try PriorBoxParam.getAttr(key: "aspect_ratios", attrs: opDesc.attrs)
-      variances = try PriorBoxParam.getAttr(key: "variances", attrs: opDesc.attrs)
-      flip = try PriorBoxParam.getAttr(key: "flip", attrs: opDesc.attrs)
-      clip = try PriorBoxParam.getAttr(key: "clip", attrs: opDesc.attrs)
-      stepW = try PriorBoxParam.getAttr(key: "step_w", attrs: opDesc.attrs)
-      stepH = try PriorBoxParam.getAttr(key: "step_h", attrs: opDesc.attrs)
-      offset = try PriorBoxParam.getAttr(key: "offset", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
-    }
-  }
-  
-  var min_max_aspect_ratios_order: Bool = false
-  let minSizes: [Float32]
-  let maxSizes: [Float32]
-  let aspectRatios: [Float32]
-  var newAspectRatios: MTLBuffer?
-  let variances: [Float32]
-  let flip: Bool
-  let clip: Bool
-  var stepW: Float32
-  var stepH: Float32
-  let offset: Float32
-  
-  let input: Texture
-  let inputImage: Texture
-  var output: Texture
-  let outputVariances: Texture
+    var min_max_aspect_ratios_order: Bool = false
+    let minSizes: [Float32]
+    let maxSizes: [Float32]
+    let aspectRatios: [Float32]
+    var newAspectRatios: MTLBuffer?
+    let variances: [Float32]
+    let flip: Bool
+    let clip: Bool
+    var stepW: Float32
+    var stepH: Float32
+    let offset: Float32
+    
+    let input: Texture
+    let inputImage: Texture
+    var output: Texture
+    let outputVariances: Texture
 }
 
 class PriorBoxOp<P: PrecisionType>: Operator<PriorBoxKernel<P>, PriorBoxParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = PriorBoxOp<P>
-
-  func inferShape() {
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
-    }
-  }
-  
-  func delogOutput() {
-
-    print(" \(type) output: ")
-    // output
-//    let outputArray = para.output.metalTexture.float32Array()
-//    print(outputArray.strideArray())
-//    let device = para.input.metalTexture!.device
-//    let boxes:[Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: [2,0,1,3])
-//    let variances:[Float32] = device.texture2tensor(texture: para.outputVariances.metalTexture!, dim: para.outputVariances.tensorDim.dims, transpose: [2,0,1,3])
-//    print("boxes: ")
-//    print(boxes.strideArray())
-//    print("variances: ")
-//    print(variances.strideArray())
-    // output
-    print(" \(type) output: ")
     
-    let box = para.output.metalTexture.realNHWC(dim: (para.output.dim[0], para.output.dim[1], para.output.dim[2], para.output.dim[3]))
-    print(" dim: \(para.output.dim)")
-    print(box.strideArray())
-//    print((0..<box.count).map { (index: $0, value: box[$0])})
-//    print(para.output.realNHWC().strideArray())
+    typealias OpType = PriorBoxOp<P>
     
-//    let padToFourDim = para.output.padToFourDim
-//    if para.output.transpose == [0, 1, 2, 3] {
-//      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]), texturePrecision: computePrecision)
-//      print(outputArray.strideArray())
-//    } else if para.output.transpose == [0, 2, 3, 1] {
-//      print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3]), texturePrecision: computePrecision).strideArray())
-//    } else {
-//      print(" not implement")
-//    }
-    
-//    writeToLibrary(fileName: "box_out", array: outputArray)
-    
-    // output variance
-//    let outputVarianceArray = para.outputVariances.metalTexture.floatArray { (o: Float32) -> Float32 in
-//      return o
-//    }
-//
-//    print(" output variance: \(outputVarianceArray)")
+    func inferShape() {
+    }
     
-//    writeToLibrary(fileName: "variance_out", array: outputVarianceArray)
+    /// Runs the prior-box op by calling `kernel.compute` with `buffer` and `para`.
+    ///
+    /// `device` is not used here. Errors thrown by `kernel.compute` propagate
+    /// to the caller as-is, so no catch-and-rethrow wrapper is needed.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
     
-  }
+    func delogOutput() {
+        
+        print(" \(type) output: ")
+        // output
+        //    let outputArray = para.output.metalTexture.float32Array()
+        //    print(outputArray.strideArray())
+        //    let device = para.input.metalTexture!.device
+        //    let boxes:[Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: [2,0,1,3])
+        //    let variances:[Float32] = device.texture2tensor(texture: para.outputVariances.metalTexture!, dim: para.outputVariances.tensorDim.dims, transpose: [2,0,1,3])
+        //    print("boxes: ")
+        //    print(boxes.strideArray())
+        //    print("variances: ")
+        //    print(variances.strideArray())
+        // output
+        print(" \(type) output: ")
+        
+        let box = para.output.metalTexture.realNHWC(dim: (para.output.dim[0], para.output.dim[1], para.output.dim[2], para.output.dim[3]))
+        print(" dim: \(para.output.dim)")
+        print(box.strideArray())
+        //    print((0..<box.count).map { (index: $0, value: box[$0])})
+        //    print(para.output.realNHWC().strideArray())
+        
+        //    let padToFourDim = para.output.padToFourDim
+        //    if para.output.transpose == [0, 1, 2, 3] {
+        //      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]), texturePrecision: computePrecision)
+        //      print(outputArray.strideArray())
+        //    } else if para.output.transpose == [0, 2, 3, 1] {
+        //      print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3]), texturePrecision: computePrecision).strideArray())
+        //    } else {
+        //      print(" not implement")
+        //    }
+        
+        //    writeToLibrary(fileName: "box_out", array: outputArray)
+        
+        // output variance
+        //    let outputVarianceArray = para.outputVariances.metalTexture.floatArray { (o: Float32) -> Float32 in
+        //      return o
+        //    }
+        //
+        //    print(" output variance: \(outputVarianceArray)")
+        
+        //    writeToLibrary(fileName: "variance_out", array: outputVarianceArray)
+        
+    }
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ReluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ReluOp.swift
index ef109081061c601fb17a23e943dcd01af618b724..a286114b3ff02365ce5ce51a841d85955996110b 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ReluOp.swift
@@ -16,44 +16,44 @@
 import Foundation
 
 class ReluParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try ReluParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try ReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            input = try ReluParam.inputX(inputs: opDesc.inputs, from: inScope)
+            output = try ReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  let input: Texture
-  var output: Texture
+    let input: Texture
+    var output: Texture
 }
 
 class ReluOp<P: PrecisionType>: Operator<ReluKernel<P>, ReluParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = ReluOp<P>
-  
-  func inferShape() {
-    para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = ReluOp<P>
+    
+    func inferShape() {
+        para.output.dim = para.input.dim
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    print(para.output.metalTexture)
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
-//    let device = para.output.metalTexture!.device
-//    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-//    print(outputArray.strideArray())
-  }
-  
+    
+    /// Runs the relu op by calling `kernel.compute` with `buffer` and `para`.
+    ///
+    /// `device` is not used here. Errors thrown by `kernel.compute` propagate
+    /// to the caller as-is, so no catch-and-rethrow wrapper is needed.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        print(para.output.metalTexture)
+        print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+        //    let device = para.output.metalTexture!.device
+        //    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+        //    print(outputArray.strideArray())
+    }
+    
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ReshapeOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ReshapeOp.swift
index e40eae02d0c11c0bd372514466b28cef27dea96b..417344f1da60f160b7727872355652821e4cdb61 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ReshapeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ReshapeOp.swift
@@ -16,63 +16,63 @@ import Foundation
 import Metal
 
 class ReshapeParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try ReshapeParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try ReshapeParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      shape = try ReshapeParam.getAttr(key: "shape", attrs: opDesc.attrs)
-        
-      var s: [Int] = shape.map { Int($0) }
-      
-      var di = -1
-      var ml = 1
-      for i in 0..<s.count {
-        if s[i] == -1 {
-          di = i
-          continue
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            input = try ReshapeParam.inputX(inputs: opDesc.inputs, from: inScope)
+            output = try ReshapeParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            shape = try ReshapeParam.getAttr(key: "shape", attrs: opDesc.attrs)
+            
+            var s: [Int] = shape.map { Int($0) }
+            
+            var di = -1
+            var ml = 1
+            for i in 0..<s.count {
+                if s[i] == -1 {
+                    di = i
+                    continue
+                }
+                ml *= s[i]
+            }
+            if di >= 0 {
+                s[di] = input.dim.numel() / ml
+            }
+            output.tensorDim = Dim.init(inDim: s)
+            var dim: [Int] = [1, 1, 1, 1]
+            for i in 0..<s.count {
+                dim[4-s.count+i] = s[i]
+            }
+            output.padToFourDim = Dim.init(inDim: dim)
+            output.dim = output.padToFourDim
+        } catch let error {
+            throw error
         }
-        ml *= s[i]
-      }
-      if di >= 0 {
-        s[di] = input.dim.numel() / ml
-      }
-      output.tensorDim = Dim.init(inDim: s)
-      var dim: [Int] = [1, 1, 1, 1]
-      for i in 0..<s.count {
-        dim[4-s.count+i] = s[i]
-      }
-      output.padToFourDim = Dim.init(inDim: dim)
-      output.dim = output.padToFourDim
-    } catch let error {
-      throw error
     }
-  }
-  let input: Texture
-  let shape: [Int32]
-  var output: Texture
+    let input: Texture
+    let shape: [Int32]
+    var output: Texture
 }
 
 class ReshapeOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = ReshapeOp<P>
-
-  func inferShape() {
-    // para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = ReshapeOp<P>
+    
+    func inferShape() {
+        // para.output.dim = para.input.dim
+    }
+    
+    /// Runs the reshape op by calling `kernel.compute` with `buffer` and `para`.
+    ///
+    /// `device` is not used here. Errors thrown by `kernel.compute` propagate
+    /// to the caller as-is, so no catch-and-rethrow wrapper is needed.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+    func delogOutput() {
+        print("reshape delog")
+        let device = para.output.metalTexture!.device
+        let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+        print(outputArray.strideArray())
+        //    print(outputArray)
     }
-  }
-  func delogOutput() {
-    print("reshape delog")
-    let device = para.output.metalTexture!.device
-    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-    print(outputArray.strideArray())
-//    print(outputArray)
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ResizeBilinearOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ResizeBilinearOp.swift
index 980bb734a796c067012855f8a0d0c4ccef33afdb..e71a62b682bd40bdb3f910aafb2d11f2fe439df0 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ResizeBilinearOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ResizeBilinearOp.swift
@@ -15,50 +15,44 @@
 import Foundation
 
 class ResizeBilinearParam<P: PrecisionType>: OpParam {
-  typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try ResizeBilinearParam.inputX(inputs: opDesc.inputs, from: inScope)
-//      if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) {
-//        fatalError()
-//      }
-      output = try ResizeBilinearParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      out_h = try ResizeBilinearParam.getAttr(key: "out_h", attrs: opDesc.attrs)
-      out_w = try ResizeBilinearParam.getAttr(key: "out_w", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
+    typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            input = try ResizeBilinearParam.inputX(inputs: opDesc.inputs, from: inScope)
+            //      if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) {
+            //        fatalError()
+            //      }
+            output = try ResizeBilinearParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            out_h = try ResizeBilinearParam.getAttr(key: "out_h", attrs: opDesc.attrs)
+            out_w = try ResizeBilinearParam.getAttr(key: "out_w", attrs: opDesc.attrs)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  let input: Texture
-  var output: Texture
-  let out_h: Int32
-  let out_w: Int32
+    let input: Texture
+    var output: Texture
+    let out_h: Int32
+    let out_w: Int32
 }
 
 class ResizeBilinearOp<P: PrecisionType>: Operator<ResizeBilinearKernel<P>, ResizeBilinearParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = ResizeBilinearOp<P>
-
-  func inferShape() {
-    //        para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = ResizeBilinearOp<P>
+    
+    func inferShape() {
+        //        para.output.dim = para.input.dim
+    }
+    
+    /// Runs the resize op by calling `kernel.compute` with `buffer` and `para`.
+    ///
+    /// `device` is not used here. Errors thrown by `kernel.compute` propagate
+    /// to the caller as-is, so no catch-and-rethrow wrapper is needed.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-  }
-  
+    
 }
-
-
-
-
-
-
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ShapeOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ShapeOp.swift
index c13c3864e4f73bdad1b83e19ca9f66051eea266d..fd358a67aee6e9403d7d8cb5ca3d1c11e5e1f5cb 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ShapeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ShapeOp.swift
@@ -15,39 +15,39 @@
 import Foundation
 
 class ShapeParam<P: PrecisionType>: OpParam {
- // typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try ShapeParam.input(inputs: opDesc.inputs, from: inScope)
-      output = try ShapeParam.outputOut(outputs: opDesc.outputs, from: inScope)
-    } catch let error {
-      throw error
+    // typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            input = try ShapeParam.input(inputs: opDesc.inputs, from: inScope)
+            output = try ShapeParam.outputOut(outputs: opDesc.outputs, from: inScope)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  var output: Texture
-  let input: Texture
+    var output: Texture
+    let input: Texture
 }
 
 class ShapeOp<P: PrecisionType>: Operator<ShapeKernel<P>, ShapeParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = ShapeOp<P>
-
-  func inferShape() {
-    //        para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = ShapeOp<P>
+    
+    func inferShape() {
+        //        para.output.dim = para.input.dim
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-  }
-  
+    
+    /// Runs the shape op by calling `kernel.compute` with `buffer` and `para`.
+    ///
+    /// `device` is not used here. Errors thrown by `kernel.compute` propagate
+    /// to the caller as-is, so no catch-and-rethrow wrapper is needed.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+    }
+    
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/SoftmaxOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/SoftmaxOp.swift
index 2b2455eaa60142f890c7ee5e14244c77854a0ccd..f13bf201956f04ee0a41124bf8556c1d6f31e410 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/SoftmaxOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/SoftmaxOp.swift
@@ -16,48 +16,48 @@ import Foundation
 import Metal
 
 class SoftmaxParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try SoftmaxParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try SoftmaxParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      
-      //assert(input.tensorDim.dims.count == 2)
-      //assert(input.transpose == [0, 1, 2, 3])
-      
-      output.dim = input.dim
-      output.tensorDim = input.tensorDim
-      output.padToFourDim = input.padToFourDim
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            input = try SoftmaxParam.inputX(inputs: opDesc.inputs, from: inScope)
+            output = try SoftmaxParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            
+            //assert(input.tensorDim.dims.count == 2)
+            //assert(input.transpose == [0, 1, 2, 3])
+            
+            output.dim = input.dim
+            output.tensorDim = input.tensorDim
+            output.padToFourDim = input.padToFourDim
+        } catch let error {
+            throw error
+        }
     }
-  }
-  let input: Texture
-  var output: Texture
+    let input: Texture
+    var output: Texture
 }
 
 class SoftmaxOp<P: PrecisionType>: Operator<SoftmaxKernel<P>, SoftmaxParam<P>>, Runable, Creator, InferShaperable{
-  typealias OpType = SoftmaxOp<P>
-
-  func inferShape() {
-    // para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    typealias OpType = SoftmaxOp<P>
+    
+    func inferShape() {
+        // para.output.dim = para.input.dim
     }
-  }
-  
-  func delogOutput() {
-    print("softmax delog")
-    print(para.input)
     
-    print(para.output)
-    let padToFourDim = para.output.padToFourDim
-    let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
-    print(outputArray.strideArray())
-  }
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        print("softmax delog")
+        print(para.input)
+        
+        print(para.output)
+        let padToFourDim = para.output.padToFourDim
+        let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
+        print(outputArray.strideArray())
+    }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/SplitOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/SplitOp.swift
index 4d9933f39275d522cec71ca08a591182433d7bae..4d5cb9b0beff3a657c5a72fe00ef6f9e9140cc58 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/SplitOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/SplitOp.swift
@@ -15,63 +15,63 @@
 import Foundation
 
 class SplitParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try SplitParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = Texture.init(device: input.metalTexture!.device, inDim: input.dim)
-      axis = try SplitParam.getAttr(key: "axis", attrs: opDesc.attrs)
-      sections = try SplitParam.getAttr(key: "sections", attrs: opDesc.attrs)
-      if axis < 0 {
-        axis = input.tensorDim.cout() + axis
-      }
-      guard let outlist = opDesc.outputs["Out"] else {
-        fatalError()
-      }
-      for out in outlist {
-        guard let variant = inScope[out], let v = variant as? Texture else {
-          fatalError()
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            input = try SplitParam.inputX(inputs: opDesc.inputs, from: inScope)
+            output = Texture.init(device: input.metalTexture!.device, inDim: input.dim)
+            axis = try SplitParam.getAttr(key: "axis", attrs: opDesc.attrs)
+            sections = try SplitParam.getAttr(key: "sections", attrs: opDesc.attrs)
+            if axis < 0 {
+                axis = input.tensorDim.cout() + axis
+            }
+            guard let outlist = opDesc.outputs["Out"] else {
+                fatalError()
+            }
+            for out in outlist {
+                guard let variant = inScope[out], let v = variant as? Texture else {
+                    fatalError()
+                }
+                outputList.append(v)
+                sections.append(Int32(v.tensorDim.dims[axis]))
+            }
+        } catch let error {
+            throw error
         }
-        outputList.append(v)
-        sections.append(Int32(v.tensorDim.dims[axis]))
-      }
-    } catch let error {
-      throw error
     }
-  }
-  
-  var axis: Int
-  let input: Texture
-  var output: Texture
-  var outputList: [Texture] = []
-  var sections: [Int32] = []
+    
+    var axis: Int
+    let input: Texture
+    var output: Texture
+    var outputList: [Texture] = []
+    var sections: [Int32] = []
 }
 
 class SplitOp<P: PrecisionType>: Operator<SplitKernel<P>, SplitParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = SplitOp<P>
-
-  func inferShape() {
-    //        para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = SplitOp<P>
+    
+    func inferShape() {
+        //        para.output.dim = para.input.dim
+    }
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    let device = para.input.metalTexture!.device
-    for out in para.outputList {
-      let arr: [Float32] = device.texture2tensor(texture: out.metalTexture, dim: out.tensorDim.dims, transpose: out.transpose)
-      print(arr.strideArray())
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        let device = para.input.metalTexture!.device
+        for out in para.outputList {
+            let arr: [Float32] = device.texture2tensor(texture: out.metalTexture, dim: out.tensorDim.dims, transpose: out.transpose)
+            print(arr.strideArray())
+        }
     }
-  }
-  
+    
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/TransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/TransposeOp.swift
index 064955fcac20937ae3ac8a12f51ef52ab5a00ba9..c05c08066797d8d5c7855d8294ad7f703e85e582 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/TransposeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/TransposeOp.swift
@@ -16,43 +16,43 @@ import Foundation
 import Metal
 
 class TransposeParam<P: PrecisionType>: OpParam {
-  //typealias ParamPrecisionType = P
-  required init(opDesc: PMOpDesc, inScope: Scope) throws {
-    do {
-      input = try TransposeParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = try TransposeParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      axis = try TransposeParam.getAttr(key: "axis", attrs: opDesc.attrs)
-    } catch let error {
-      throw error
+    //typealias ParamPrecisionType = P
+    required init(opDesc: PMOpDesc, inScope: Scope) throws {
+        do {
+            input = try TransposeParam.inputX(inputs: opDesc.inputs, from: inScope)
+            output = try TransposeParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            axis = try TransposeParam.getAttr(key: "axis", attrs: opDesc.attrs)
+        } catch let error {
+            throw error
+        }
     }
-  }
-  let input: Texture
-  var output: Texture
-  let axis: [Int32]
+    let input: Texture
+    var output: Texture
+    let axis: [Int32]
 }
 
 class TransposeOp<P: PrecisionType>: Operator<TransposeKernel<P>, TransposeParam<P>>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = TransposeOp<P>
-
-  func inferShape() {
-    //para.output.dim = para.input.dim
-  }
-  
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
+    
+    typealias OpType = TransposeOp<P>
+    
+    func inferShape() {
+        //para.output.dim = para.input.dim
+    }
+    
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        do {
+            try kernel.compute(commandBuffer: buffer, param: para)
+        } catch let error {
+            throw error
+        }
+    }
+    
+    func delogOutput() {
+        print(" \(type) output: ")
+        let device = para.output.metalTexture!.device
+        let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+        print(outputArray.strideArray())
     }
-  }
-  
-  func delogOutput() {
-    print(" \(type) output: ")
-    let device = para.output.metalTexture!.device
-    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-    print(outputArray.strideArray())
-  }
 }
 
 
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/PMBlockDesc.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/PMBlockDesc.swift
index b021b09008b1f3bef3ba01d5a51fe7b7803fedaa..27ed620c24dcbc2f4423debe8b14c4436d0b6dda 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Program/PMBlockDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Program/PMBlockDesc.swift
@@ -45,13 +45,13 @@ public class PMBlockDesc {
 }
 
 extension PMBlockDesc: CustomStringConvertible, CustomDebugStringConvertible {
-  public var description: String {
+    public var description: String {
         var str = ""
         
         for i in 0..<ops.count {
-          str += " op \(i): "
-          let op = ops[i]
-          str += op.description
+            str += " op \(i): "
+            let op = ops[i]
+            str += op.description
         }
         
         for varDesc in vars {
@@ -61,7 +61,7 @@ extension PMBlockDesc: CustomStringConvertible, CustomDebugStringConvertible {
         return str
     }
     
-  public var debugDescription: String {
+    public var debugDescription: String {
         return description
     }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/PMOpDesc.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/PMOpDesc.swift
index 663677150eb0f0240b032a713424aac8ed66c86a..51a9e6be2fbab46917c828c5c50cc1f0d03eb6f7 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Program/PMOpDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Program/PMOpDesc.swift
@@ -15,12 +15,12 @@
 import Foundation
 
 class PMOpDesc {
-  let inputs: [String : [String]]
-  var paraInputs: [String : [String]]
-  var outputs: [String : [String]]
-  let unusedOutputs: [String : [String]]
-  var attrs: [String : Attr] = [:]
-  var type: String
+    let inputs: [String : [String]]
+    var paraInputs: [String : [String]]
+    var outputs: [String : [String]]
+    let unusedOutputs: [String : [String]]
+    var attrs: [String : Attr] = [:]
+    var type: String
     init(protoOpDesc: OpDesc) {
         type = protoOpDesc.type
         let creator = { (vars: [OpDesc_Var], canAdd: (String) -> Bool) -> [String : [String]] in
@@ -58,24 +58,24 @@ class PMOpDesc {
 }
 
 extension PMOpDesc: CustomStringConvertible, CustomDebugStringConvertible {
-  var description: String {
-    var str = ""
-    str += "op type: \(type): \n"
-    str += "    op inputs: \n"
-    str += "        \(inputs) \n"
-    str += "    op para inputs: \n"
-    str += "        \(paraInputs) \n"
-    str += "    op para outputs: \n"
-    str += "        \(outputs) \n"
-    str += "    op attrs: \n"
-    str += "        \(attrs) \n"
+    var description: String {
+        var str = ""
+        str += "op type: \(type): \n"
+        str += "    op inputs: \n"
+        str += "        \(inputs) \n"
+        str += "    op para inputs: \n"
+        str += "        \(paraInputs) \n"
+        str += "    op para outputs: \n"
+        str += "        \(outputs) \n"
+        str += "    op attrs: \n"
+        str += "        \(attrs) \n"
+        
+        return str
+    }
+    
+    var debugDescription: String {
+        return description
+    }
+    
     
-    return str
-  }
-  
-  var debugDescription: String {
-    return description
-  }
-  
-  
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/PMVarDesc.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/PMVarDesc.swift
index 130e6f49fb61b1488538849e86ff793b53f31a86..e97f448e294c1187a12b4e6bf1139e0425de26b3 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Program/PMVarDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Program/PMVarDesc.swift
@@ -79,7 +79,7 @@ public class PMVarDesc {
 }
 
 extension PMVarDesc: CustomStringConvertible, CustomDebugStringConvertible {
-  public var description: String {
+    public var description: String {
         var str = ""
         str += "var name \(name): \n"
         if let inTensorDesc = tensorDesc {
@@ -93,7 +93,7 @@ extension PMVarDesc: CustomStringConvertible, CustomDebugStringConvertible {
         return str
     }
     
-  public var debugDescription: String {
+    public var debugDescription: String {
         return description
     }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift
index dcb065de3d8c6e7ec6cf437cbc2a19305def08ae..e4248b64098bd7939f09577566021c1756c05085 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift
@@ -15,286 +15,286 @@
 import Foundation
 
 precedencegroup ChainNode {
-  associativity: left
-  higherThan: MultiplicationPrecedence
+    associativity: left
+    higherThan: MultiplicationPrecedence
 }
 
 infix operator --> : ChainNode
 
 class Node {
-  var inputs: [Node] = []
-  var outputs: [Node] = []
-  var type: String
-  var opDesc: PMOpDesc?
-  init(inOpDesc: PMOpDesc) {
-    type = inOpDesc.type
-    opDesc = inOpDesc
-  }
-  
-  init(inType: String) {
-    type = inType
-  }
-  
-  subscript(index: Int) -> [Node] {
-    var nodes: [Node] = []
-    getNodesWithLocation(index: index, nowIndex: 0, nodes: &nodes)
-    return nodes
-  }
-  
-  func getNodesWithLocation(index: Int, nowIndex: Int, nodes: inout [Node]) {
-    if index == nowIndex {
-      nodes.append(self)
+    var inputs: [Node] = []
+    var outputs: [Node] = []
+    var type: String
+    var opDesc: PMOpDesc?
+    init(inOpDesc: PMOpDesc) {
+        type = inOpDesc.type
+        opDesc = inOpDesc
     }
     
-    for output in outputs {
-      output.getNodesWithLocation(index: index, nowIndex: nowIndex + 1, nodes: &nodes)
+    init(inType: String) {
+        type = inType
     }
-  }
-  
-  static func -->(lNode: Node, rNode: Node) -> Node {
-    lNode.outputs.append(rNode)
-    rNode.inputs.append(lNode)
-    return rNode
-  }
-  
-  func depth(begin: UInt = 1) -> UInt {
-    var beginMax: UInt = 1
-    for output in outputs {
-      let subDepth = output.depth(begin: begin + 1)
-      beginMax = max(begin, subDepth)
-    }
-    beginMax = max(begin, beginMax)
-    return beginMax
-  }
-  
-  func to(depth: UInt) -> Node {
-    let beginNode = Node.init(inType: type)
-    beginNode.opDesc = opDesc
-    to(depth: depth - 1, withNode: beginNode)
-    return beginNode
-  }
-  
-  func folderWith(fusion: Fusion.Type, removedNodes: inout [Node]) {
-    let fusionNode = fusion.fusionNode()
-    let change = fusion.change()
-    let inOutputs = outputs
-    outputs.removeAll()
-    opDesc?.outputs.removeAll()
-    for i in 0..<inOutputs.count {
-      inOutputs[i].folderWith(beginNode: self, matchNode: fusionNode.outputs[i], change: change, removedNodes: &removedNodes)
+    
+    subscript(index: Int) -> [Node] {
+        var nodes: [Node] = []
+        getNodesWithLocation(index: index, nowIndex: 0, nodes: &nodes)
+        return nodes
     }
-    opDesc?.type = fusion.fusionType()
-    type = fusion.fusionType()
-  }
-  
-  private func folderWith(beginNode: Node, matchNode: Node, change: [String : [(from: String, to: String)]], removedNodes: inout [Node]) {
-    guard let inOpdesc = opDesc else {
-      fatalError()
+    
+    func getNodesWithLocation(index: Int, nowIndex: Int, nodes: inout [Node]) {
+        if index == nowIndex {
+            nodes.append(self)
+        }
+        
+        for output in outputs {
+            output.getNodesWithLocation(index: index, nowIndex: nowIndex + 1, nodes: &nodes)
+        }
     }
     
-    for attr in inOpdesc.attrs {
-      beginNode.opDesc?.attrs[attr.key] = attr.value
-      //            print(beginNode.opDesc?.attrs)
+    static func -->(lNode: Node, rNode: Node) -> Node {
+        lNode.outputs.append(rNode)
+        rNode.inputs.append(lNode)
+        return rNode
     }
     
-    for paraInput in inOpdesc.paraInputs {
-      if let inChanges = change[type] {
-        for keyChange in inChanges {
-          if keyChange.from == paraInput.key {
-            beginNode.opDesc?.paraInputs[keyChange.to] = paraInput.value
-          } else {
-            beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value
-          }
+    /// Length of the longest chain of outputs starting at this node, counting this node as level `begin`.
+    func depth(begin: UInt = 1) -> UInt {
+        var beginMax: UInt = max(begin, 1) // never report less than 1, matching the previous lower bound
+        for output in outputs {
+            beginMax = max(beginMax, output.depth(begin: begin + 1)) // keep the deepest branch, not the last one
         }
-      } else {
-        beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value
-      }
+        // A node without outputs reports its own level.
+        return beginMax
     }
     
-    if matchNode.outputs.count == 0 {
-      beginNode.outputs.append(contentsOf: outputs)
-      beginNode.opDesc?.outputs = inOpdesc.outputs
-      
+    func to(depth: UInt) -> Node {
+        let beginNode = Node.init(inType: type)
+        beginNode.opDesc = opDesc
+        to(depth: depth - 1, withNode: beginNode)
+        return beginNode
     }
-    removedNodes.append(self)
     
-    for i in 0..<matchNode.outputs.count {
-      outputs[i].folderWith(beginNode: beginNode, matchNode: matchNode.outputs[i], change: change, removedNodes: &removedNodes)
+    func folderWith(fusion: Fusion.Type, removedNodes: inout [Node]) {
+        let fusionNode = fusion.fusionNode()
+        let change = fusion.change()
+        let inOutputs = outputs
+        outputs.removeAll()
+        opDesc?.outputs.removeAll()
+        for i in 0..<inOutputs.count {
+            inOutputs[i].folderWith(beginNode: self, matchNode: fusionNode.outputs[i], change: change, removedNodes: &removedNodes)
+        }
+        opDesc?.type = fusion.fusionType()
+        type = fusion.fusionType()
     }
     
-  }
-  
-  private func to(depth: UInt, withNode: Node) {
-    if depth < 1 {
-      return
+    /// Merges this node into `beginNode` and continues along the outputs that `matchNode` describes.
+    /// Attributes are copied as-is; parameter inputs are copied with keys renamed per `change[type]`.
+    /// This node is appended to `removedNodes`; traps if `opDesc` is nil.
+    private func folderWith(beginNode: Node, matchNode: Node, change: [String : [(from: String, to: String)]], removedNodes: inout [Node]) {
+        guard let inOpdesc = opDesc else {
+            fatalError()
+        }
+        
+        for attr in inOpdesc.attrs {
+            beginNode.opDesc?.attrs[attr.key] = attr.value
+        }
+        
+        // Renames registered for this op type; without any, every key is copied unchanged.
+        let renames = change[type] ?? []
+        for paraInput in inOpdesc.paraInputs {
+            // Store the value under each renamed key, or under its own key when no rename
+            // matches. (Previously a non-matching entry also re-added the original key.)
+            let renamedKeys = renames.filter { $0.from == paraInput.key }.map { $0.to }
+            for key in (renamedKeys.isEmpty ? [paraInput.key] : renamedKeys) {
+                beginNode.opDesc?.paraInputs[key] = paraInput.value
+            }
+        }
+        
+        if matchNode.outputs.count == 0 {
+            // End of the pattern: the fused node takes over this node's outputs.
+            beginNode.outputs.append(contentsOf: outputs)
+            beginNode.opDesc?.outputs = inOpdesc.outputs
+        }
+        removedNodes.append(self)
+        
+        // Assumes `outputs` has at least as many entries as `matchNode.outputs` — confirm the caller matched shapes.
+        for i in 0..<matchNode.outputs.count {
+            outputs[i].folderWith(beginNode: beginNode, matchNode: matchNode.outputs[i], change: change, removedNodes: &removedNodes)
+        }
+        
     }
     
-    for output in outputs {
-      let node = Node.init(inType: output.type)
-      node.opDesc = output.opDesc
-      withNode.outputs.append(node)
-      output.to(depth: depth - 1, withNode: node)
-    }
-  }
-  
-  func relationship() -> [String : Node]{
-    var map: [String : Node] = [:]
-    relationship(map: &map)
-    return map
-  }
-  
-  private func relationship(map: inout [String : Node]) {
-    guard let inOpDesc = opDesc else {
-      return
+    private func to(depth: UInt, withNode: Node) {
+        if depth < 1 {
+            return
+        }
+        
+        for output in outputs {
+            let node = Node.init(inType: output.type)
+            node.opDesc = output.opDesc
+            withNode.outputs.append(node)
+            output.to(depth: depth - 1, withNode: node)
+        }
     }
     
-    for output in inOpDesc.outputs {
-      for outputKey in output.value {
-        map[outputKey] = self
-      }
+    func relationship() -> [String : Node]{
+        var map: [String : Node] = [:]
+        relationship(map: &map)
+        return map
     }
     
-    for output in outputs {
-      output.relationship(map: &map)
+    private func relationship(map: inout [String : Node]) {
+        guard let inOpDesc = opDesc else {
+            return
+        }
+        
+        for output in inOpDesc.outputs {
+            for outputKey in output.value {
+                map[outputKey] = self
+            }
+        }
+        
+        for output in outputs {
+            output.relationship(map: &map)
+        }
     }
-  }
-  
+    
 }
 
 extension Node: Equatable {
-  static func == (lhs: Node, rhs: Node) -> Bool {
-    if lhs.outputs.count != rhs.outputs.count {
-      return false
-    }
-    
-    if lhs.type != rhs.type {
-      return false
+    static func == (lhs: Node, rhs: Node) -> Bool {
+        if lhs.outputs.count != rhs.outputs.count {
+            return false
+        }
+        
+        if lhs.type != rhs.type {
+            return false
+        }
+        
+        for i in 0..<lhs.outputs.count {
+            if lhs.outputs[i] != rhs.outputs[i] {
+                return false
+            }
+        }
+        return true
     }
     
-    for i in 0..<lhs.outputs.count {
-      if lhs.outputs[i] != rhs.outputs[i] {
-        return false
-      }
-    }
-    return true
-  }
-  
 }
 
 class ProgramOptimize<P: PrecisionType> {
-  // register fusion
-  let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp<P>.self,
-//                                  ConvAddAddPreluOp<P>.self,
-                                  ConvAddPreluOp<P>.self,
-                                  ConvAddOp<P>.self,
-                                  ConvBNReluOp<P>.self,
-                                  DwConvBNReluOp<P>.self,
-                                  ElementwiseAddPreluOp<P>.self
-  ]
-  
-  func optimize(originProgramDesc: PMProgramDesc) -> PMProgramDesc {
+    // register fusion
+    let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp<P>.self,
+                                    //                                  ConvAddAddPreluOp<P>.self,
+        ConvAddPreluOp<P>.self,
+        ConvAddOp<P>.self,
+        ConvBNReluOp<P>.self,
+        DwConvBNReluOp<P>.self,
+        ElementwiseAddPreluOp<P>.self
+    ]
     
-    guard originProgramDesc.blocks.count == 1 else {
-      fatalError(" not support yet")
-    }
-    
-    var mapForNodeChain: [String : Node] = [:]
-    var nodes: [Node] = []
-    var typeMapNodes: [String : [(node: Node, output: [String : Node])]] = [:]
-    let block = originProgramDesc.blocks[0]
-    for opDesc in block.ops {
-        print(opDesc.type)
-      guard let opInputKeys = opInfos[opDesc.type]?.inputs, let outputKeys = opInfos[opDesc.type]?.outputs else {
-        fatalError()
-      }
-      
-      let node = Node.init(inOpDesc: opDesc)
-      for inputKey in opInputKeys {
-        if let inputs = opDesc.inputs[inputKey] {
-          for input in inputs {
-            if let inputNode = mapForNodeChain[input] {
-              _ = inputNode --> node
-            }
-          }
+    func optimize(originProgramDesc: PMProgramDesc) -> PMProgramDesc {
+        
+        guard originProgramDesc.blocks.count == 1 else {
+            fatalError(" not support yet")
         }
-      }
-      
-      for outputKey in outputKeys {
-        if let outputs = opDesc.outputs[outputKey] {
-          for output in outputs {
-            mapForNodeChain[output] = node
-          }
-        }
-      }
-      
-      nodes.append(node)
-      
-      if var inNodes = typeMapNodes[opDesc.type] {
-        inNodes.append((node, mapForNodeChain))
-        typeMapNodes[opDesc.type] = inNodes
-      } else {
-        typeMapNodes[opDesc.type] = [(node, mapForNodeChain)]
-      }
-    }
-    
-    for fusion in fusionOps {
-      let fusionNode = fusion.fusionNode()
-      let depth = fusionNode.depth()
-      if let toMatchNodes = typeMapNodes[fusionNode.type] {
-        for node in toMatchNodes {
-          
-          let toNode = node.node.to(depth: depth)
-          if toNode == fusionNode {   // match
-            var canFolder = true
-            let relationshipMap = toNode.relationship()
+        
+        var mapForNodeChain: [String : Node] = [:]
+        var nodes: [Node] = []
+        var typeMapNodes: [String : [(node: Node, output: [String : Node])]] = [:]
+        let block = originProgramDesc.blocks[0]
+        for opDesc in block.ops {
+            print(opDesc.type)
+            guard let opInputKeys = opInfos[opDesc.type]?.inputs, let outputKeys = opInfos[opDesc.type]?.outputs else {
+                fatalError()
+            }
             
-            for toCheck in fusion.needCheck() {
-              //              let nodes = toCheck
-              let checkNodes = toNode[toCheck.0]
-              
-              for checkNode in checkNodes {
-                let inputToChecks = checkNode.opDesc?.inputs[toCheck.1] ?? []
-                for inputToCheck in inputToChecks {
-                  if node.output[inputToCheck] == nil {
-                    if relationshipMap[inputToCheck] == nil {
-                      canFolder = false
+            let node = Node.init(inOpDesc: opDesc)
+            for inputKey in opInputKeys {
+                if let inputs = opDesc.inputs[inputKey] {
+                    for input in inputs {
+                        if let inputNode = mapForNodeChain[input] {
+                            _ = inputNode --> node
+                        }
                     }
-                  }
                 }
-                
-                let paramInputToChecks = checkNode.opDesc?.paraInputs[toCheck.1] ?? []
-                for paramInputToCheck in paramInputToChecks {
-                  if node.output[paramInputToCheck] == nil {
-                    if relationshipMap[paramInputToCheck] == nil {
-                      canFolder = false
+            }
+            
+            for outputKey in outputKeys {
+                if let outputs = opDesc.outputs[outputKey] {
+                    for output in outputs {
+                        mapForNodeChain[output] = node
                     }
-                  }
                 }
-              }
             }
             
-            if !canFolder {
-              continue
-            }
+            nodes.append(node)
             
-            var removeNodes: [Node] = []
-            node.node.folderWith(fusion: fusion, removedNodes: &removeNodes)
-            for removeNode in removeNodes {
-              nodes.remove(element: removeNode)
+            if var inNodes = typeMapNodes[opDesc.type] {
+                inNodes.append((node, mapForNodeChain))
+                typeMapNodes[opDesc.type] = inNodes
+            } else {
+                typeMapNodes[opDesc.type] = [(node, mapForNodeChain)]
             }
-          }
         }
-      }
-    }
-    
-    var ops: [PMOpDesc] = []
-    for node in nodes {
-      ops.append(node.opDesc!)
+        
+        for fusion in fusionOps {
+            let fusionNode = fusion.fusionNode()
+            let depth = fusionNode.depth()
+            if let toMatchNodes = typeMapNodes[fusionNode.type] {
+                for node in toMatchNodes {
+                    
+                    let toNode = node.node.to(depth: depth)
+                    if toNode == fusionNode {   // match
+                        var canFolder = true
+                        let relationshipMap = toNode.relationship()
+                        
+                        for toCheck in fusion.needCheck() {
+                            //              let nodes = toCheck
+                            let checkNodes = toNode[toCheck.0]
+                            
+                            for checkNode in checkNodes {
+                                let inputToChecks = checkNode.opDesc?.inputs[toCheck.1] ?? []
+                                for inputToCheck in inputToChecks {
+                                    if node.output[inputToCheck] == nil {
+                                        if relationshipMap[inputToCheck] == nil {
+                                            canFolder = false
+                                        }
+                                    }
+                                }
+                                
+                                let paramInputToChecks = checkNode.opDesc?.paraInputs[toCheck.1] ?? []
+                                for paramInputToCheck in paramInputToChecks {
+                                    if node.output[paramInputToCheck] == nil {
+                                        if relationshipMap[paramInputToCheck] == nil {
+                                            canFolder = false
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                        
+                        if !canFolder {
+                            continue
+                        }
+                        
+                        var removeNodes: [Node] = []
+                        node.node.folderWith(fusion: fusion, removedNodes: &removeNodes)
+                        for removeNode in removeNodes {
+                            nodes.remove(element: removeNode)
+                        }
+                    }
+                }
+            }
+        }
+        
+        var ops: [PMOpDesc] = []
+        for node in nodes {
+            ops.append(node.opDesc!)
+        }
+        
+        let newProgramDesc = PMProgramDesc.init()
+        let newBlock = PMBlockDesc.init(inVars: block.vars, inOps: ops)
+        newProgramDesc.blocks.append(newBlock)
+        return newProgramDesc
     }
-    
-    let newProgramDesc = PMProgramDesc.init()
-    let newBlock = PMBlockDesc.init(inVars: block.vars, inOps: ops)
-    newProgramDesc.blocks.append(newBlock)
-    return newProgramDesc
-  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/Scope.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/Scope.swift
index d73eefd096b32e06bff5ac82f7fb3aa16fce825e..478867b08ce80ecde1bf85913fb35de434b54f9c 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Program/Scope.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Program/Scope.swift
@@ -48,7 +48,7 @@ public class Scope {
         }
         
     }
-
+    
     func clear(){
         vars.removeAll()
     }