merge metal branch to pointer

b7a1d552 · xiaohaichun · 13c9388c · 1c72c0ef · b7a1d552 · 13c9388c
91 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -24,6 +24,7 @@
 *.lai
 *.la
 *.lib
+*.a
 # Executables
 *.exe

--- a/metal/PreluKernel/PreluKernel.xcodeproj/project.pbxproj
+++ b/metal/PreluKernel/PreluKernel.xcodeproj/project.pbxproj
-// !$*UTF8*$!
-{
-	archiveVersion = 1;
-	classes = {
-	};
-	objectVersion = 50;
-	objects = {
-/* Begin PBXBuildFile section */
-		FCEB6843212F00CC00D2448E /* PreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCEB6842212F00CC00D2448E /* PreluKernel.metal */; };
-/* End PBXBuildFile section */
-/* Begin PBXFileReference section */
-		FCEB683F212F00CC00D2448E /* PreluKernel.metallib */ = {isa = PBXFileReference; explicitFileType = "archive.metal-library"; includeInIndex = 0; path = PreluKernel.metallib; sourceTree = BUILT_PRODUCTS_DIR; };
-		FCEB6842212F00CC00D2448E /* PreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PreluKernel.metal; sourceTree = "<group>"; };
-/* End PBXFileReference section */
-/* Begin PBXGroup section */
-		FCEB6838212F00CC00D2448E = {
-			isa = PBXGroup;
-			children = (
-				FCEB6841212F00CC00D2448E /* PreluKernel */,
-				FCEB6840212F00CC00D2448E /* Products */,
-			);
-			sourceTree = "<group>";
-		};
-		FCEB6840212F00CC00D2448E /* Products */ = {
-			isa = PBXGroup;
-			children = (
-				FCEB683F212F00CC00D2448E /* PreluKernel.metallib */,
-			);
-			name = Products;
-			sourceTree = "<group>";
-		};
-		FCEB6841212F00CC00D2448E /* PreluKernel */ = {
-			isa = PBXGroup;
-			children = (
-				FCEB6842212F00CC00D2448E /* PreluKernel.metal */,
-			);
-			path = PreluKernel;
-			sourceTree = "<group>";
-		};
-/* End PBXGroup section */
-/* Begin PBXNativeTarget section */
-		FCEB683E212F00CC00D2448E /* PreluKernel */ = {
-			isa = PBXNativeTarget;
-			buildConfigurationList = FCEB6846212F00CC00D2448E /* Build configuration list for PBXNativeTarget "PreluKernel" */;
-			buildPhases = (
-				FCEB683D212F00CC00D2448E /* Sources */,
-			);
-			buildRules = (
-			);
-			dependencies = (
-			);
-			name = PreluKernel;
-			productName = PreluKernel;
-			productReference = FCEB683F212F00CC00D2448E /* PreluKernel.metallib */;
-			productType = "com.apple.product-type.metal-library";
-		};
-/* End PBXNativeTarget section */
-/* Begin PBXProject section */
-		FCEB6839212F00CC00D2448E /* Project object */ = {
-			isa = PBXProject;
-			attributes = {
-				LastUpgradeCheck = 0940;
-				ORGANIZATIONNAME = orange;
-				TargetAttributes = {
-					FCEB683E212F00CC00D2448E = {
-						CreatedOnToolsVersion = 9.4.1;
-					};
-				};
-			};
-			buildConfigurationList = FCEB683C212F00CC00D2448E /* Build configuration list for PBXProject "PreluKernel" */;
-			compatibilityVersion = "Xcode 9.3";
-			developmentRegion = en;
-			hasScannedForEncodings = 0;
-			knownRegions = (
-				en,
-			);
-			mainGroup = FCEB6838212F00CC00D2448E;
-			productRefGroup = FCEB6840212F00CC00D2448E /* Products */;
-			projectDirPath = "";
-			projectRoot = "";
-			targets = (
-				FCEB683E212F00CC00D2448E /* PreluKernel */,
-			);
-		};
-/* End PBXProject section */
-/* Begin PBXSourcesBuildPhase section */
-		FCEB683D212F00CC00D2448E /* Sources */ = {
-			isa = PBXSourcesBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-				FCEB6843212F00CC00D2448E /* PreluKernel.metal in Sources */,
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-		};
-/* End PBXSourcesBuildPhase section */
-/* Begin XCBuildConfiguration section */
-		FCEB6844212F00CC00D2448E /* Debug */ = {
-			isa = XCBuildConfiguration;
-			buildSettings = {
-				IPHONEOS_DEPLOYMENT_TARGET = 11.4;
-				MTL_ENABLE_DEBUG_INFO = YES;
-				SDKROOT = iphoneos;
-			};
-			name = Debug;
-		};
-		FCEB6845212F00CC00D2448E /* Release */ = {
-			isa = XCBuildConfiguration;
-			buildSettings = {
-				IPHONEOS_DEPLOYMENT_TARGET = 11.4;
-				MTL_ENABLE_DEBUG_INFO = NO;
-				SDKROOT = iphoneos;
-			};
-			name = Release;
-		};
-		FCEB6847212F00CC00D2448E /* Debug */ = {
-			isa = XCBuildConfiguration;
-			buildSettings = {
-				CODE_SIGN_STYLE = Automatic;
-				DEVELOPMENT_TEAM = Z5M2UUN5YV;
-				PRODUCT_NAME = "$(TARGET_NAME)";
-			};
-			name = Debug;
-		};
-		FCEB6848212F00CC00D2448E /* Release */ = {
-			isa = XCBuildConfiguration;
-			buildSettings = {
-				CODE_SIGN_STYLE = Automatic;
-				DEVELOPMENT_TEAM = Z5M2UUN5YV;
-				PRODUCT_NAME = "$(TARGET_NAME)";
-			};
-			name = Release;
-		};
-/* End XCBuildConfiguration section */
-/* Begin XCConfigurationList section */
-		FCEB683C212F00CC00D2448E /* Build configuration list for PBXProject "PreluKernel" */ = {
-			isa = XCConfigurationList;
-			buildConfigurations = (
-				FCEB6844212F00CC00D2448E /* Debug */,
-				FCEB6845212F00CC00D2448E /* Release */,
-			);
-			defaultConfigurationIsVisible = 0;
-			defaultConfigurationName = Release;
-		};
-		FCEB6846212F00CC00D2448E /* Build configuration list for PBXNativeTarget "PreluKernel" */ = {
-			isa = XCConfigurationList;
-			buildConfigurations = (
-				FCEB6847212F00CC00D2448E /* Debug */,
-				FCEB6848212F00CC00D2448E /* Release */,
-			);
-			defaultConfigurationIsVisible = 0;
-			defaultConfigurationName = Release;
-		};
-/* End XCConfigurationList section */
-	};
-	rootObject = FCEB6839212F00CC00D2448E /* Project object */;
-}
--- a/metal/PreluKernel/PreluKernel.xcodeproj/project.xcworkspace/contents.xcworkspacedata
+++ b/metal/PreluKernel/PreluKernel.xcodeproj/project.xcworkspace/contents.xcworkspacedata
-<?xml version="1.0" encoding="UTF-8"?>
-<Workspace
-   version = "1.0">
-   <FileRef
-      location = "self:PreluKernel.xcodeproj">
-   </FileRef>
-</Workspace>
--- a/metal/PreluKernel/PreluKernel.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
+++ b/metal/PreluKernel/PreluKernel.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-	<key>IDEDidComputeMac32BitWarning</key>
-	<true/>
-</dict>
-</plist>
--- a/metal/PreluKernel/PreluKernel.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist
+++ b/metal/PreluKernel/PreluKernel.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-	<key>SchemeUserState</key>
-	<dict>
-		<key>PreluKernel.xcscheme</key>
-		<dict>
-			<key>orderHint</key>
-			<integer>0</integer>
-		</dict>
-	</dict>
-</dict>
-</plist>
--- a/metal/PreluKernel/PreluKernel/PreluKernel.metal
+++ b/metal/PreluKernel/PreluKernel/PreluKernel.metal
-//
-//  PreluKernel.metal
-//  PreluKernel
-//
-//  Created by liuRuiLong on 2018/8/23.
-//  Copyright © 2018年 orange. All rights reserved.
-//
-#include <metal_stdlib>
-using namespace metal;
--- a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
@@ -11,6 +11,34 @@
        <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
    </dependencies>
    <scenes>
+        <!--Multi Predict View Controller-->
+        <scene sceneID="ec4-AW-9Vs">
+            <objects>
+                <viewController id="Vwd-lt-764" customClass="MultiPredictViewController" customModule="paddle_mobile_demo" customModuleProvider="target" sceneMemberID="viewController">
+                    <view key="view" contentMode="scaleToFill" id="55D-rz-Ex6">
+                        <rect key="frame" x="0.0" y="0.0" width="375" height="667"/>
+                        <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
+                        <subviews>
+                            <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="TQt-X9-PdF">
+                                <rect key="frame" x="164" y="318" width="46" height="30"/>
+                                <state key="normal" title="Button"/>
+                                <connections>
+                                    <action selector="predictAct:" destination="Vwd-lt-764" eventType="touchUpInside" id="d4z-Cv-6jY"/>
+                                </connections>
+                            </button>
+                        </subviews>
+                        <color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
+                        <constraints>
+                            <constraint firstItem="TQt-X9-PdF" firstAttribute="centerY" secondItem="55D-rz-Ex6" secondAttribute="centerY" id="bL3-wr-TcH"/>
+                            <constraint firstItem="TQt-X9-PdF" firstAttribute="centerX" secondItem="55D-rz-Ex6" secondAttribute="centerX" id="sBi-RQ-sJn"/>
+                        </constraints>
+                        <viewLayoutGuide key="safeArea" id="bsd-h4-RYZ"/>
+                    </view>
+                </viewController>
+                <placeholder placeholderIdentifier="IBFirstResponder" id="68E-SG-96s" userLabel="First Responder" sceneMemberID="firstResponder"/>
+            </objects>
+            <point key="canvasLocation" x="-559" y="686"/>
+        </scene>
        <!--View Controller-->
        <scene sceneID="tne-QT-ifu">
            <objects>
@@ -20,9 +48,9 @@
                        <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
                        <subviews>
                            <imageView userInteractionEnabled="NO" contentMode="scaleAspectFit" horizontalHuggingPriority="251" verticalHuggingPriority="251" translatesAutoresizingMaskIntoConstraints="NO" id="ZZh-fw-LwK">
-                                <rect key="frame" x="0.0" y="20" width="375" height="247"/>
+                                <rect key="frame" x="0.0" y="20" width="225" height="247"/>
                            </imageView>
-                            <label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" text="Thread:" textAlignment="natural" lineBreakMode="tailTruncation" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="2EB-m2-a3L">
+                            <label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" text="Platform:" textAlignment="natural" lineBreakMode="tailTruncation" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="2EB-m2-a3L">
                                <rect key="frame" x="10" y="538" width="68" height="24"/>
                                <constraints>
                                    <constraint firstAttribute="width" constant="68" id="Q5J-tq-JSX"/>
@@ -142,9 +170,14 @@
                                <fontDescription key="fontDescription" type="system" pointSize="15"/>
                                <textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
                            </textView>
+                            <view contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="Cil-py-NiA">
+                                <rect key="frame" x="225" y="20" width="150" height="247"/>
+                                <color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
+                            </view>
                        </subviews>
                        <color key="backgroundColor" red="1" green="1" blue="1" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
                        <constraints>
+                            <constraint firstItem="m5L-O7-P31" firstAttribute="top" secondItem="Cil-py-NiA" secondAttribute="bottom" constant="10" id="16p-IK-b5X"/>
                            <constraint firstItem="6Tk-OE-BBY" firstAttribute="trailing" secondItem="VQn-bS-fWp" secondAttribute="trailing" constant="10" id="1Xg-0h-9SE"/>
                            <constraint firstItem="avL-VK-Kha" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" constant="10" id="2t9-hS-VXa"/>
                            <constraint firstItem="R90-Yf-S6g" firstAttribute="centerY" secondItem="wUL-9N-u1V" secondAttribute="centerY" id="76b-Ny-1Og"/>
@@ -159,11 +192,12 @@
                            <constraint firstItem="XpL-9M-UOp" firstAttribute="centerY" secondItem="wUL-9N-u1V" secondAttribute="centerY" id="KWW-qT-Rzf"/>
                            <constraint firstItem="6MG-gv-hD5" firstAttribute="centerY" secondItem="avL-VK-Kha" secondAttribute="centerY" id="KZa-YZ-DEs"/>
                            <constraint firstItem="2EB-m2-a3L" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" constant="10" id="Le3-TN-zOL"/>
-                            <constraint firstItem="ZZh-fw-LwK" firstAttribute="trailing" secondItem="6Tk-OE-BBY" secondAttribute="trailing" id="MeS-HQ-voE"/>
+                            <constraint firstItem="ZZh-fw-LwK" firstAttribute="trailing" secondItem="6Tk-OE-BBY" secondAttribute="trailing" constant="-150" id="MeS-HQ-voE"/>
                            <constraint firstItem="m5L-O7-P31" firstAttribute="top" secondItem="ZZh-fw-LwK" secondAttribute="bottom" constant="10" id="NUL-Ta-VI8"/>
                            <constraint firstItem="m5L-O7-P31" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" constant="15" id="RFA-z1-9aB"/>
                            <constraint firstItem="wUL-9N-u1V" firstAttribute="width" secondItem="a3K-ri-NVs" secondAttribute="width" id="Rp6-Bh-BN3"/>
                            <constraint firstItem="6MG-gv-hD5" firstAttribute="trailing" secondItem="6Tk-OE-BBY" secondAttribute="trailing" id="S0W-0G-75m"/>
+                            <constraint firstItem="Cil-py-NiA" firstAttribute="top" secondItem="6Tk-OE-BBY" secondAttribute="top" id="UNc-Et-9Yv"/>
                            <constraint firstItem="w7H-Sk-Rai" firstAttribute="leading" secondItem="wUL-9N-u1V" secondAttribute="trailing" id="VBM-8b-jP0"/>
                            <constraint firstItem="VQn-bS-fWp" firstAttribute="top" secondItem="m5L-O7-P31" secondAttribute="bottom" constant="8" id="VpS-4N-mOo"/>
                            <constraint firstItem="wUL-9N-u1V" firstAttribute="top" secondItem="2EB-m2-a3L" secondAttribute="bottom" constant="35" id="VpU-j2-gaE"/>
@@ -175,10 +209,12 @@
                            <constraint firstItem="ZZh-fw-LwK" firstAttribute="top" secondItem="6Tk-OE-BBY" secondAttribute="top" id="eIC-fZ-OEE"/>
                            <constraint firstItem="976-fk-Kx2" firstAttribute="centerY" secondItem="wUL-9N-u1V" secondAttribute="centerY" id="fFg-pB-eyU"/>
                            <constraint firstItem="6Tk-OE-BBY" firstAttribute="bottom" secondItem="wUL-9N-u1V" secondAttribute="bottom" constant="40" id="fG6-0p-I0P"/>
+                            <constraint firstItem="Cil-py-NiA" firstAttribute="trailing" secondItem="6Tk-OE-BBY" secondAttribute="trailing" id="gGK-DB-ibv"/>
                            <constraint firstItem="XpL-9M-UOp" firstAttribute="leading" secondItem="w7H-Sk-Rai" secondAttribute="trailing" id="guC-Db-cA9"/>
                            <constraint firstItem="6MG-gv-hD5" firstAttribute="leading" secondItem="avL-VK-Kha" secondAttribute="trailing" constant="10" id="jNW-iC-u7V"/>
                            <constraint firstItem="4ey-Xr-U4e" firstAttribute="bottom" secondItem="6Tk-OE-BBY" secondAttribute="bottom" id="o1X-q5-P7j"/>
                            <constraint firstItem="6MG-gv-hD5" firstAttribute="top" secondItem="VQn-bS-fWp" secondAttribute="bottom" constant="8" id="tAE-ss-jlA"/>
+                            <constraint firstItem="Cil-py-NiA" firstAttribute="leading" secondItem="ZZh-fw-LwK" secondAttribute="trailing" id="teJ-PP-h2R"/>
                            <constraint firstItem="4ey-Xr-U4e" firstAttribute="top" secondItem="wUL-9N-u1V" secondAttribute="bottom" constant="10" id="udc-wT-jqd"/>
                            <constraint firstItem="ZZh-fw-LwK" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" id="vXI-l2-CjL"/>
                            <constraint firstItem="VQn-bS-fWp" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" constant="10" id="wtI-Dl-YPq"/>
@@ -195,11 +231,12 @@
                        <outlet property="resultTextView" destination="VQn-bS-fWp" id="306-c7-3vM"/>
                        <outlet property="selectImageView" destination="ZZh-fw-LwK" id="afR-Bv-6AW"/>
                        <outlet property="threadPickerView" destination="DlO-dk-RMr" id="Kk4-QV-b5o"/>
+                        <outlet property="videoView" destination="Cil-py-NiA" id="QY2-BP-SNS"/>
                    </connections>
                </viewController>
                <placeholder placeholderIdentifier="IBFirstResponder" id="dkx-z0-nzr" sceneMemberID="firstResponder"/>
            </objects>
-            <point key="canvasLocation" x="-724" y="98.50074962518741"/>
+            <point key="canvasLocation" x="-1127" y="-3"/>
        </scene>
    </scenes>
    <resources>

--- a/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift
+//
+//  MultiPredictViewController.swift
+//  paddle-mobile-demo
+//
+//  Created by liuRuiLong on 2018/9/14.
+//  Copyright © 2018年 orange. All rights reserved.
+//
+import UIKit
+import paddle_mobile
+/// Demo screen that owns two GPU `Runner`s (one per network) and loads the
+/// second one when the storyboard button fires `predictAct(_:)`.
+class MultiPredictViewController: UIViewController {
+  /// Runner built around `MobileNet_ssd_hand`; assigned in `viewDidLoad()`.
+  var runner1: Runner!
+  /// Runner built around `Genet`; assigned in `viewDidLoad()`.
+  var runner2: Runner!
+  /// Creates both runners on the shared Metal device and command queue.
+  override func viewDidLoad() {
+    super.viewDidLoad()
+    let mobileNet = MobileNet_ssd_hand.init(device: MetalHelper.shared.device)
+    let genet = Genet.init(device: MetalHelper.shared.device)
+    runner1 = Runner.init(inNet: mobileNet, commandQueue: MetalHelper.shared.queue, inPlatform: .GPU)
+    // NOTE(review): an earlier revision also called
+    // `MetalHelper.shared.device.makeCommandQueue()` here but never passed the
+    // result to a runner, so the extra queue was allocated for nothing and has
+    // been dropped. If `runner2` is meant to have its own queue, create it
+    // here and pass it as `commandQueue` - confirm the intended design.
+    runner2 = Runner.init(inNet: genet, commandQueue: MetalHelper.shared.queue, inPlatform: .GPU)
+  }
+  /// Button action: loads `runner2` and reports a failed load.
+  ///
+  /// - Parameter sender: the control that triggered the action (unused).
+  @IBAction func predictAct(_ sender: Any) {
+    // The load result used to be stored in an unused local; surface failures
+    // instead of discarding them.
+    guard self.runner2.load() else {
+      print("load failed")
+      return
+    }
+    // The prediction experiments below are intentionally left disabled.
+//    DispatchQueue.global().async {
+//      let image1 = UIImage.init(named: "hand.jpg")
+//      let success = self.runner2.load()
+//      if success {
+//        for i in 0..<10000 {
+//          print(i)
+//          self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in
+//            print("result1: ")
+////            print(res)
+//          })
+//        }
+//      } else {
+//        print("load failed")
+//      }
+//      self.runner1.clear()
+//    }
+//    return
+//    DispatchQueue.global().async {
+////      sleep(1)
+//      let image1 = UIImage.init(named: "banana.jpeg")
+////      if success {
+//        for _ in 0..<10 {
+//          self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in
+//            print("result2: ")
+//            print(res)
+//          })
+//        }
+////      } else {
+////        print("load failed")
+////      }
+////      self.runner2.clear()
+//    }
+  }
+}
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift
+import Foundation
+import QuartzCore
+/// Keeps a smoothed frames-per-second estimate that is refreshed every time
+/// `frameCompleted()` is called.
+public class FPSCounter {
+  /// Latest smoothed estimate; only this class can write it.
+  private(set) public var fps: Double = 0
+  /// Number of frames counted in the current measurement window.
+  var frames = 0
+  /// `CACurrentMediaTime()` value captured when the current window began.
+  var startTime: CFTimeInterval = 0
+  /// Opens a fresh measurement window: zero frames, starting now.
+  public func start() {
+    frames = 0
+    startTime = CACurrentMediaTime()
+  }
+  /// Counts one finished frame and, once the window is older than 0.1 s,
+  /// blends the window's frame rate into `fps` (75% old value, 25% new).
+  /// A window older than 1 s is restarted after the blend.
+  public func frameCompleted() {
+    frames += 1
+    let windowAge = CACurrentMediaTime() - startTime
+    // Too little time has passed for a meaningful rate; just keep counting.
+    guard windowAge > 0.1 else { return }
+    let smoothing = 0.75
+    let windowRate = Double(frames) / windowAge
+    fps = smoothing*fps + (1 - smoothing)*windowRate
+    guard windowAge > 1 else { return }
+    frames = 0
+    startTime = CACurrentMediaTime()
+  }
+}
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift
+import UIKit
+import Metal
+import CoreVideo
+import AVFoundation
+/// Callbacks emitted by `VideoCapture`. Every method is optional, so a
+/// delegate implements only the outputs it is interested in.
+@available(iOS 10.0, *)
+@objc public protocol VideoCaptureDelegate: NSObjectProtocol {
+  /// Delivers a forwarded video frame as its raw sample buffer together with
+  /// the buffer's presentation timestamp.
+  @objc optional func videoCapture(_ capture: VideoCapture, didCaptureSampleBuffer sampleBuffer: CMSampleBuffer, timestamp: CMTime)
+    /// Delivers a forwarded video frame as a Metal texture; `texture` is nil
+    /// when the conversion from the sample buffer did not produce one.
+    @objc optional func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime)
+    /// Delivers the preview image of a captured still photo; `previewImage`
+    /// is nil when no image could be built from the preview buffer.
+    @objc optional func videoCapture(_ capture: VideoCapture, didCapturePhoto previewImage: UIImage?)
+    /// Delivers a captured still photo as a Metal texture; `texture` is nil
+    /// when the conversion from the photo buffer did not produce one.
+    @objc optional func videoCapture(_ capture: VideoCapture, didCapturePhotoTexture texture: MTLTexture?)
+}
+/**
+  Simple interface to the iPhone's camera.
+
+  Owns an `AVCaptureSession` with one video-data output and one photo output,
+  and forwards captured frames and photos to `delegate`. When a Metal device
+  is supplied, frames can also be handed out as `MTLTexture`s.
+*/
+@available(iOS 10.0, *)
+public class VideoCapture: NSObject {
+    /// Preview layer attached to the capture session; assigned by `setUpCamera`.
+    public var previewLayer: AVCaptureVideoPreviewLayer?
+    /// Receiver of captured frames and photos.
+    public weak var delegate: VideoCaptureDelegate?
+    /// Rate at which frames are forwarded to the delegate; -1 forwards every frame.
+    public var fps = -1
+    /// Metal device used to create the texture cache; nil disables texture output.
+    private let device: MTLDevice?
+    private let videoOrientation: AVCaptureVideoOrientation
+    private var textureCache: CVMetalTextureCache?
+    private let captureSession = AVCaptureSession()
+    private let videoOutput = AVCaptureVideoDataOutput()
+    private let photoOutput = AVCapturePhotoOutput()
+    /// Serial queue used for camera setup and for sample-buffer callbacks.
+    private let queue = DispatchQueue(label: "net.machinethink.camera-queue")
+    /// Presentation timestamp of the last frame forwarded to the delegate.
+    private var lastTimestamp = CMTime()
+    private let cameraPosition: AVCaptureDevice.Position
+    /// Stores the configuration; no camera work happens until `setUp` is called.
+    ///
+    /// - Parameters:
+    ///   - device: Metal device for texture conversion, or nil to skip it.
+    ///   - orientation: orientation applied to the preview and video connections.
+    ///   - position: which camera (`.back` or `.front`) to open.
+    public init(device: MTLDevice? = nil, orientation: AVCaptureVideoOrientation = .portrait, position: AVCaptureDevice.Position = .back) {
+        self.device = device
+        self.videoOrientation = orientation
+        self.cameraPosition = position
+        super.init()
+    }
+    /// Configures the capture session on the camera queue and reports the
+    /// outcome on the main queue.
+    ///
+    /// - Parameters:
+    ///   - sessionPreset: capture quality preset for the session.
+    ///   - completion: called on the main queue with `true` on success.
+    public func setUp(sessionPreset: AVCaptureSession.Preset = .medium,
+                    completion: @escaping (Bool) -> Void) {
+        queue.async {
+            let success = self.setUpCamera(sessionPreset: sessionPreset)
+            DispatchQueue.main.async {
+                completion(success)
+            }
+        }
+    }
+    /// Returns the first front-facing wide-angle camera, or nil if none is found.
+    /// (The name is kept as-is because other files in the module may call it.)
+    func fontCamera() -> AVCaptureDevice? {
+        let devices = AVCaptureDevice.DiscoverySession.init(deviceTypes: [.builtInWideAngleCamera], mediaType: AVMediaType.video, position: .front).devices
+        return devices.first
+    }
+    /// Builds the capture pipeline: texture cache (if a Metal device was given),
+    /// camera input, preview layer, BGRA video output and photo output.
+    ///
+    /// - Parameter sessionPreset: capture quality preset for the session.
+    /// - Returns: `false` when the texture cache, the camera device or its
+    ///   input could not be created; `true` otherwise.
+    func setUpCamera(sessionPreset: AVCaptureSession.Preset) -> Bool {
+        if let inDevice = device{
+            guard CVMetalTextureCacheCreate(kCFAllocatorDefault, nil, inDevice, nil, &textureCache) == kCVReturnSuccess else {
+                print("Error: could not create a texture cache")
+                return false
+            }
+        }
+        captureSession.beginConfiguration()
+        // Balance beginConfiguration() on every exit path. Previously the
+        // early `return false` branches below left the session stuck in an
+        // open configuration transaction.
+        defer { captureSession.commitConfiguration() }
+        captureSession.sessionPreset = sessionPreset
+        var oCaptureDevice: AVCaptureDevice?
+        switch cameraPosition {
+        case .back:
+            oCaptureDevice = AVCaptureDevice.default(for: AVMediaType.video)
+        case .front:
+            oCaptureDevice = fontCamera()
+        default:
+            break
+        }
+        guard let captureDevice = oCaptureDevice else {
+            print("Error: no video devices available")
+            return false
+        }
+        guard let videoInput = try? AVCaptureDeviceInput(device: captureDevice) else {
+            print("Error: could not create AVCaptureDeviceInput")
+            return false
+        }
+        if captureSession.canAddInput(videoInput) {
+            captureSession.addInput(videoInput)
+        }
+        let previewLayer = AVCaptureVideoPreviewLayer(session: captureSession)
+        previewLayer.videoGravity = AVLayerVideoGravity.resizeAspect
+        previewLayer.connection?.videoOrientation = self.videoOrientation
+        self.previewLayer = previewLayer
+        let settings: [String : Any] = [
+        kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA)
+        ]
+        videoOutput.videoSettings = settings
+        videoOutput.alwaysDiscardsLateVideoFrames = true
+        videoOutput.setSampleBufferDelegate(self, queue: queue)
+        if captureSession.canAddOutput(videoOutput) {
+            captureSession.addOutput(videoOutput)
+        }
+        // We want the buffers to be in portrait orientation otherwise they are
+        // rotated by 90 degrees. Need to set this _after_ addOutput()!
+        videoOutput.connection(with: AVMediaType.video)?.videoOrientation = self.videoOrientation
+        if captureSession.canAddOutput(photoOutput) {
+            captureSession.addOutput(photoOutput)
+        }
+        return true
+    }
+    /// Starts the capture session if it is not already running.
+    public func start() {
+        if !captureSession.isRunning {
+            captureSession.startRunning()
+        }
+    }
+    /// Stops the capture session if it is running.
+    public func stop() {
+        if captureSession.isRunning {
+            captureSession.stopRunning()
+        }
+    }
+    /* Captures a single frame of the camera input. */
+    public func capturePhoto() {
+        let settings = AVCapturePhotoSettings(format: [kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA)])
+        // Request a 480x360 preview only when a preview pixel format is
+        // offered; indexing [0] on an empty list would crash.
+        if let previewPixelFormat = settings.__availablePreviewPhotoPixelFormatTypes.first {
+            settings.previewPhotoFormat = [
+                kCVPixelBufferPixelFormatTypeKey as String: previewPixelFormat,
+                kCVPixelBufferWidthKey as String: 480,
+                kCVPixelBufferHeightKey as String: 360,
+            ]
+        }
+        photoOutput.capturePhoto(with: settings, delegate: self)
+    }
+    /// Wraps the sample buffer's image in a `.bgra8Unorm` Metal texture.
+    ///
+    /// - Returns: nil when there is no texture cache, no sample buffer, no
+    ///   image buffer, or no texture was produced.
+    func convertToMTLTexture(sampleBuffer: CMSampleBuffer?) -> MTLTexture? {
+        guard let textureCache = textureCache,
+            let sampleBuffer = sampleBuffer,
+            let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else {
+            return nil
+        }
+        let width = CVPixelBufferGetWidth(imageBuffer)
+        let height = CVPixelBufferGetHeight(imageBuffer)
+        var texture: CVMetalTexture?
+        CVMetalTextureCacheCreateTextureFromImage(kCFAllocatorDefault, textureCache, imageBuffer, nil, .bgra8Unorm, width, height, 0, &texture)
+        // A nil `texture` simply yields nil here.
+        return texture.flatMap { CVMetalTextureGetTexture($0) }
+    }
+    /// Renders the sample buffer's image into a `UIImage` via Core Image.
+    ///
+    /// - Returns: nil when there is no sample buffer, no image buffer, or the
+    ///   `CGImage` could not be created.
+    func convertToUIImage(sampleBuffer: CMSampleBuffer?) -> UIImage? {
+        guard let sampleBuffer = sampleBuffer,
+            let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else {
+            return nil
+        }
+        let width = CVPixelBufferGetWidth(imageBuffer)
+        let height = CVPixelBufferGetHeight(imageBuffer)
+        let rect = CGRect(x: 0, y: 0, width: CGFloat(width), height: CGFloat(height))
+        let ciImage = CIImage(cvPixelBuffer: imageBuffer)
+        let ciContext = CIContext(options: nil)
+        guard let cgImage = ciContext.createCGImage(ciImage, from: rect) else {
+            return nil
+        }
+        return UIImage(cgImage: cgImage)
+    }
+}
+/// Receives frames from the video-data output and forwards them to the
+/// delegate, optionally throttled to `fps`.
+extension VideoCapture: AVCaptureVideoDataOutputSampleBufferDelegate {
+  /// Called for every captured frame; forwards it as a sample buffer and, if
+  /// the delegate implements the texture callback, as a Metal texture too.
+  public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
+    // Because lowering the capture device's FPS looks ugly in the preview,
+    // we capture at full speed but only call the delegate at its desired
+    // framerate. If `fps` is -1, we run at the full framerate.
+    let timestamp = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
+    // Any non-positive `fps` (not just -1) now means "no throttling": building
+    // a frame interval from it would give CMTimeMake a zero or negative
+    // timescale.
+    if fps > 0 {
+        let deltaTime = timestamp - lastTimestamp
+        guard deltaTime >= CMTimeMake(1, Int32(fps)) else { return }
+    }
+    lastTimestamp = timestamp
+    self.delegate?.videoCapture?(self, didCaptureSampleBuffer: sampleBuffer, timestamp: timestamp)
+    // Only pay for the texture conversion when somebody listens for it.
+    if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCaptureVideoTexture:timestamp:))) ?? false{
+        let texture = convertToMTLTexture(sampleBuffer: sampleBuffer)
+        delegate?.videoCapture?(self, didCaptureVideoTexture: texture, timestamp: timestamp)
+    }
+  }
+  /// Called when the output discards a frame; only logs the event.
+  public func captureOutput(_ output: AVCaptureOutput, didDrop sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
+    print("dropped frame")
+  }
+}
+/// Receives the result of `capturePhoto()` and hands it to the delegate.
+extension VideoCapture: AVCapturePhotoCaptureDelegate {
+  /// Forwards a finished photo as a Metal texture and/or a preview `UIImage`,
+  /// depending on which optional callbacks the delegate implements. Nothing
+  /// is forwarded when `error` is non-nil.
+  public func photoOutput(_ captureOutput: AVCapturePhotoOutput,
+                          didFinishProcessingPhoto photoSampleBuffer: CMSampleBuffer?,
+                          previewPhoto previewPhotoSampleBuffer: CMSampleBuffer?,
+                          resolvedSettings: AVCaptureResolvedPhotoSettings,
+                          bracketSettings: AVCaptureBracketedStillImageSettings?,
+                          error: Error?) {
+    guard error == nil else { return }
+    let textureSelector = #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhotoTexture:))
+    if self.delegate?.responds(to: textureSelector) ?? false {
+        // Full-resolution photo as a texture.
+        let photoTexture: MTLTexture? = convertToMTLTexture(sampleBuffer: photoSampleBuffer)
+        self.delegate?.videoCapture?(self, didCapturePhotoTexture: photoTexture)
+    }
+    let previewSelector = #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhoto:))
+    if self.delegate?.responds(to: previewSelector) ?? false {
+        // Preview buffer as a UIImage.
+        let preview: UIImage? = convertToUIImage(sampleBuffer: previewPhotoSampleBuffer)
+        self.delegate?.videoCapture?(self, didCapturePhoto: preview)
+    }
+  }
+}
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
@@ -14,27 +14,32 @@
 import UIKit
 import MetalKit
+import CoreMedia
 import paddle_mobile
 import MetalPerformanceShaders
-let platform: Platform = .GPU
+var platform: Platform = .GPU
-let threadSupport = [1]
+let threadSupport: [(Platform, String)] = [(.GPU, "GPU"), (.CPU, "CPU")]
-let modelHelperMap: [SupportModel : Runner] = [.mobilenet_ssd : Runner.init(inNet: MobileNet_ssd_hand.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform),
+//.mobilenet_ssd : Runner.init(inNet: MobileNet_ssd_hand.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform),
+let modelHelperMap: [SupportModel : Runner] = [
                                               .genet : Runner.init(inNet: Genet.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform),
                                               .mobilenet_ssd_ar : Runner.init(inNet: MobileNet_ssd_AR.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform)]
 //, .genet : Genet.init()
 //let modelHelperMap: [SupportModel : Net] = [.mobilenet : MobileNet.init(), .mobilenet_ssd : MobileNet_ssd_hand.init()]
+let netSupport: [SupportModel : Net] = [.genet : Genet.init(device: MetalHelper.shared.device), .mobilenet_ssd_ar : MobileNet_ssd_AR.init(device: MetalHelper.shared.device)]
 enum SupportModel: String{
  //  case mobilenet = "mobilenet"
-  case mobilenet_ssd    = "mobilenetssd"
+//  case mobilenet_ssd    = "mobilenetssd"
  case genet            = "genet"
  case mobilenet_ssd_ar = "mobilenetssd_ar"
  static func supportedModels() -> [SupportModel] {
-    //.mobilenet,
+    // .mobilenet,
-    return [.mobilenet_ssd, .genet, .mobilenet_ssd_ar]
+    // .mobilenet_ssd,
+    return [.genet, .mobilenet_ssd_ar]
  }
 }
@@ -44,24 +49,36 @@ class ViewController: UIViewController {
  @IBOutlet weak var elapsedTimeLabel: UILabel!
  @IBOutlet weak var modelPickerView: UIPickerView!
  @IBOutlet weak var threadPickerView: UIPickerView!
+  @IBOutlet weak var videoView: UIView!
+  var videoCapture: VideoCapture!
  var selectImage: UIImage?
  var inputPointer: UnsafeMutablePointer<Float32>?
  var modelType: SupportModel = SupportModel.supportedModels()[0]
  var toPredictTexture: MTLTexture?
-  var runner: Runner {
+  var runner: Runner!
+  var threadNum = 1
+  @IBAction func loadAct(_ sender: Any) {
+     runner = Runner.init(inNet: netSupport[modelType]!, commandQueue: MetalHelper.shared.queue, inPlatform: platform)
+    if platform == .CPU {
+      if inputPointer == nil {
+        inputPointer = runner.preproccess(image: selectImage!.cgImage!)
-    get {
-      return modelHelperMap[modelType] ?! " has no this type "
      }
-    set {
+    } else if platform == .GPU {
+      if self.toPredictTexture == nil {
+        runner.getTexture(image: selectImage!.cgImage!) {[weak self] (texture) in
+          self?.toPredictTexture = texture
        }
      }
+    } else {
+      fatalError( " unsupport " )
+    }
-  var threadNum = 1
-  @IBAction func loadAct(_ sender: Any) {
    if runner.load() {
      print(" load success ! ")
    } else {
@@ -81,7 +98,7 @@ class ViewController: UIViewController {
  }
  @IBAction func predictAct(_ sender: Any) {
-    let max = 1
+    let max = 50
    switch platform {
    case .GPU:
      guard let inTexture = toPredictTexture else {
@@ -91,7 +108,7 @@ class ViewController: UIViewController {
      let startDate = Date.init()
      for i in 0..<max {
-        runner.predict(texture: inTexture) { [weak self] (success, res) in
+        runner.predict(texture: inTexture) { [weak self] (success, resultHolder)  in
          guard let sSelf = self else {
            fatalError()
          }
@@ -99,11 +116,19 @@ class ViewController: UIViewController {
            if i == max - 1 {
              let time = Date.init().timeIntervalSince(startDate)
              DispatchQueue.main.async {
-                sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: res)
+//                print(resultHolder!.result![0])
+                sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: resultHolder!)
                sSelf.elapsedTimeLabel.text = "平均耗时: \(time/Double(max) * 1000.0) ms"
              }
            }
          }
+          DispatchQueue.main.async {
+            resultHolder?.releasePointer()
+          }
+//            print("释放")
        }
 //        print("sleep before ")
 //        usleep(33000)
@@ -116,6 +141,7 @@ class ViewController: UIViewController {
      for _ in 0..<10 {
        runner.predict(inputPointer: inInputPointer) { (success, res) in
+          res?.releaseOutput()
        }
      }
@@ -129,11 +155,12 @@ class ViewController: UIViewController {
            if i == max - 1 {
              let time = Date.init().timeIntervalSince(startDate)
              DispatchQueue.main.async {
-                sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: res)
+//                sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: res)
                sSelf.elapsedTimeLabel.text = "平均耗时: \(time/Double(max) * 1000.0) ms"
              }
            }
          }
+          res?.releaseOutput()
        }
      }
    }
@@ -141,6 +168,13 @@ class ViewController: UIViewController {
  override func viewDidLoad() {
    super.viewDidLoad()
+//    if runner.load() {
+//      print(" load success ! ")
+//    } else {
+//      print(" load error ! ")
+//    }
+//    
    modelPickerView.delegate = self
    modelPickerView.dataSource = self
    threadPickerView.delegate = self
@@ -149,15 +183,29 @@ class ViewController: UIViewController {
    selectImage = UIImage.init(named: "hand.jpg")
    selectImageView.image = selectImage
-    if platform == .CPU {
+//    if platform == .CPU {
-      inputPointer = runner.preproccess(image: selectImage!.cgImage!)
+//      inputPointer = runner.preproccess(image: selectImage!.cgImage!)
-    } else if platform == .GPU {
+//    } else if platform == .GPU {
-      runner.getTexture(image: selectImage!.cgImage!) {[weak self] (texture) in
+//      runner.getTexture(image: selectImage!.cgImage!) {[weak self] (texture) in
-        self?.toPredictTexture = texture
+//        self?.toPredictTexture = texture
-      }
+//      }
-    } else {
+//    } else {
-      fatalError( " unsupport " )
+//      fatalError( " unsupport " )
-    }
+//    }
+//    videoCapture = VideoCapture.init(device: MetalHelper.shared.device, orientation: .portrait, position: .back)
+//    videoCapture.fps = 30
+//    videoCapture.delegate = self
+//    videoCapture.setUp { (success) in
+//      DispatchQueue.main.async {
+//        if let preViewLayer = self.videoCapture.previewLayer {
+//          self.videoView.layer.addSublayer(preViewLayer)
+//          self.videoCapture.previewLayer?.frame = self.videoView.bounds
+//        }
+//        self.videoCapture.start()
+//      }
+//    }
  }
 }
@@ -186,7 +234,7 @@ extension ViewController: UIPickerViewDataSource, UIPickerViewDelegate{
    if pickerView == modelPickerView {
      return SupportModel.supportedModels()[row].rawValue
    } else if pickerView == threadPickerView {
-      return "\(threadSupport[row])"
+      return threadSupport[row].1
    } else {
      fatalError()
    }
@@ -196,7 +244,8 @@ extension ViewController: UIPickerViewDataSource, UIPickerViewDelegate{
    if pickerView == modelPickerView {
      self.modelType = SupportModel.supportedModels()[row]
    } else if pickerView == threadPickerView {
-      self.threadNum = threadSupport[row]
+      platform = threadSupport[row].0
    } else {
      fatalError()
    }
@@ -218,4 +267,32 @@ extension ViewController:  UIImagePickerControllerDelegate, UINavigationControll
  }
 }
+var bool1 = false
+extension ViewController: VideoCaptureDelegate{
+  /// Scales a captured frame with the current runner and runs a prediction on the scaled texture.
+  /// The prediction result is not displayed; its pointer is released as soon as the completion fires.
+  ///
+  /// - Parameter texture: The frame to scale and predict on.
+  func predictTexture(texture: MTLTexture){
+    // `runner` is an implicitly unwrapped optional that the visible code only assigns in `loadAct`.
+    // Frames that arrive before a model has been loaded are dropped instead of crashing.
+    guard runner != nil else {
+      return
+    }
+    runner.scaleTexture(input: texture) { (scaledTexture) in
+      self.runner.predict(texture: scaledTexture, completion: { (success, resultHolder) in
+//        print(resultHolder!.result![0])
+        resultHolder?.releasePointer()
+      })
+    }
+  }
+  /// `VideoCaptureDelegate` callback: forwards every captured frame to `predictTexture(texture:)`.
+  ///
+  /// - Parameters:
+  ///   - capture: The capture session that produced the frame (unused).
+  ///   - texture: The captured frame, or nil when no texture is supplied.
+  ///   - timestamp: Presentation time of the frame (unused).
+  func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime) {
+    // The delegate signature declares the texture optional, so a missing frame is skipped
+    // rather than force-unwrapped.
+    guard let capturedTexture = texture else {
+      return
+    }
+//    if !bool1 {
+//      DispatchQueue.main.asyncAfter(deadline: DispatchTime.init(uptimeNanoseconds: 500000000)) {
+    self.predictTexture(texture: capturedTexture)
+//      }
+//      bool1 = true
+//    }
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
--- a/metal/paddle-mobile/paddle-mobile.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/paddle-mobile.xcscheme
+++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/paddle-mobile.xcscheme
@@ -33,7 +33,7 @@
      </AdditionalOptions>
   </TestAction>
   <LaunchAction
-      buildConfiguration = "Release"
+      buildConfiguration = "Debug"
      selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
      selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
      launchStyle = "0"

--- a/metal/paddle-mobile/paddle-mobile/CPU/PaddleMobile.h
+++ b/metal/paddle-mobile/paddle-mobile/CPU/PaddleMobile.h
@@ -17,7 +17,17 @@
 #import <CoreImage/CoreImage.h>
 #import <Foundation/Foundation.h>
-@interface PaddleMobile : NSObject
+/**
+ * Result object returned by a CPU prediction: a raw float buffer plus a size.
+ * NOTE(review): -releaseOutput presumably frees `output`; confirm in the implementation.
+ */
+@interface PaddleMobileCPUResult : NSObject
+/// Raw pointer to the prediction output values (read-only for callers).
+@property (nonatomic, assign, readonly) float *output;
+/// Size of `output` — presumably a count of floats, not bytes; TODO confirm.
+@property (nonatomic, assign, readonly) int outputSize;
+/// Releases the output held by this result; call once the values are no longer needed.
+- (void)releaseOutput;
+@end
+@interface PaddleMobileCPU : NSObject
 /*
    创建对象
@@ -42,25 +52,8 @@
         andModelParamsLen:(size_t)combinedParamsLen
      andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf;
-/*
- *   进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
- */
- (NSArray *)predict:(CGImageRef)image
-                 dim:(NSArray<NSNumber *> *)dim
-               means:(NSArray<NSNumber *> *)means
-               scale:(float)scale;
-/*
- *  预测输入
- * */
- (NSArray *)predictInput:(float *)input
-                      dim:(NSArray<NSNumber *> *)dim
-                    means:(NSArray<NSNumber *> *)means
-                    scale:(float)scale;
 /*
- *  对图像进行预处理
+ *  对图像进行预处理, 需要外部开辟 output 内存, 外部释放 output 内存
 * */
 -(void)preprocess:(CGImageRef)image
           output:(float *)output
@@ -68,6 +61,22 @@
        scale:(float)scale
        dim:(NSArray<NSNumber *> *)dim;
+/*
+ * 预测预处理后的数据, 返回结果使用结束需要调用其 releaseOutput 函数进行释放
+ * */
+- (PaddleMobileCPUResult *)predictInput:(float *)input
+                                    dim:(NSArray<NSNumber *> *)dim;
+/*
+    进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
+*/
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
+/*
+    进行预测, 默认 means 为 0, scale 为 1.0
+*/
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
 /*
    清理内存
 */

--- a/metal/paddle-mobile/paddle-mobile/CPU/libpaddle-mobile.a
+++ b/metal/paddle-mobile/paddle-mobile/CPU/libpaddle-mobile.a
--- a/metal/paddle-mobile/paddle-mobile/CPUCompute.h
+++ b/metal/paddle-mobile/paddle-mobile/CPUCompute.h
@@ -16,6 +16,12 @@
 #import <Foundation/Foundation.h>
+/**
+ * Plain container for the output of a CPU-side computation: a raw float buffer and its size.
+ * NOTE(review): this class declares no release method, so ownership of `output`
+ * appears to rest with whoever receives the object — confirm against callers.
+ */
+@interface CPUResult : NSObject
+/// Raw pointer to the computed values.
+@property (nonatomic, assign) float *output;
+/// Size of `output` — presumably a count of floats; TODO confirm.
+@property (nonatomic, assign) int outputSize;
+@end
 @interface NMSCompute: NSObject
 @property (assign, nonatomic) float scoreThredshold;
@@ -34,6 +40,6 @@
 @property (strong, nonatomic) NSArray<NSNumber *> *bboxDim;
-(NSArray<NSNumber *> *)computeWithScore:(float *)score andBBoxs:(float *)bbox;
+-(CPUResult *)computeWithScore:(float *)score andBBoxs:(float *)bbox;
 @end
--- a/metal/paddle-mobile/paddle-mobile/CPUCompute.mm
+++ b/metal/paddle-mobile/paddle-mobile/CPUCompute.mm
@@ -21,6 +21,8 @@
 #import <algorithm>
 struct NMSParam {
  float *score_data;
@@ -282,9 +284,12 @@ void MultiClassNMSCompute(NMSParam *param) {
  param->output_size = output_size;
 }
+// CPUResult defines no methods of its own; the implementation body is intentionally empty.
+@implementation CPUResult
+@end
 @implementation NMSCompute
-(NSArray<NSNumber *> *)computeWithScore:(float *)score andBBoxs:(float *)bbox {
+-(CPUResult *)computeWithScore:(float *)score andBBoxs:(float *)bbox {
  NMSParam param;
  param.box_data = bbox;
  param.score_data = score;
@@ -306,12 +311,10 @@ void MultiClassNMSCompute(NMSParam *param) {
  }
  param.box_dim = box_dim;
  MultiClassNMSCompute(&param);
-  NSMutableArray<NSNumber *> *output = [NSMutableArray arrayWithCapacity:param.output_size];
+  CPUResult *cr = [[CPUResult alloc] init];
-  for (int i = 0; i < param.output_size; ++i) {
+  cr.output = param.output;
-    [output addObject:[NSNumber numberWithFloat:param.output[i]]];
+  cr.outputSize = param.output_size;
-  }
+  return cr;
-  delete param.output;
-  return output;
 }
 @end

--- a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
@@ -71,7 +71,128 @@ extension MTLDevice {
    return buffer!
  }
+  /// Reads every slice of `texture` back to the CPU and calls `cb` once per stored component.
+  ///
+  /// Each row is read as `texture.width * 4` components of type `P`, so this assumes a
+  /// four-component pixel format whose component size equals `MemoryLayout<P>.size`
+  /// — TODO confirm against the textures passed by callers.
+  ///
+  /// - Parameters:
+  ///   - texture: The texture to read; all `arrayLength` slices are visited at mipmap level 0.
+  ///   - cb: Invoked with `[x, y, slice, component]` and the value stored at that position.
+  func texture2tensor_loop<P>(texture: MTLTexture, cb: ([Int], P)->Void) -> Void {
+    let componentsPerRow = texture.width * 4
+    let componentsPerSlice = componentsPerRow * texture.height
+    let bpR = componentsPerRow * MemoryLayout<P>.size
+    let bpI = texture.height * bpR
+    let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: 1))
+    // `allocate(capacity:)` counts elements, not bytes: one slice worth of components is enough.
+    // The staging buffer is reused for every slice and freed on exit (it was previously
+    // re-allocated per slice and never released).
+    let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: componentsPerSlice)
+    defer {
+      pointer.deallocate()
+    }
+    for i in 0..<texture.arrayLength {
+      texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i)
+      for tx in 0..<componentsPerSlice {
+        // Decompose the flat component index into row, column and component-within-pixel.
+        let row = tx / componentsPerRow
+        let inRow = tx % componentsPerRow
+        let xyzn: [Int] = [inRow / 4, row, i, inRow % 4]
+        cb(xyzn, pointer[tx])
+      }
+    }
+  }
+  /// Converts a texture that stores a 3-dimensional tensor into a flat array.
+  ///
+  /// `dim` is right-aligned into a 4-entry shape padded with leading 1s. For every texture
+  /// component, slice and component index together form texture axis 1, the row is axis 2 and
+  /// the column is axis 3; texture axis `k` is then written to tensor axis `transpose[k]`.
+  /// Components whose flat index is not below the element count are skipped.
+  ///
+  /// - Parameters:
+  ///   - texture: Source texture; its size is checked against `dim` with `assert`.
+  ///   - dim: Tensor shape; expected to have exactly three entries.
+  ///   - transpose: Axis mapping from texture layout to tensor layout.
+  /// - Returns: The tensor values, `dim.reduce(1, *)` elements long.
+  func texture2tensor_3<P>(texture: MTLTexture, dim: [Int],  transpose: [Int] = [0, 1, 2, 3]) -> [P] {
+    var paddedDim: [Int] = [1, 1, 1, 1]
+    for (offset, extent) in dim.enumerated() {
+      paddedDim[4 - dim.count + offset] = extent
+    }
+    let elementCount = dim.reduce(1, *)
+    var flat: [P] = Array(repeating: Float32(0.0) as! P, count: elementCount)
+    let textureDim: [Int] = transpose.map { paddedDim[$0] }
+    assert(dim.count == 3)
+    assert(texture.width == textureDim[3])
+    assert(texture.height == textureDim[2])
+    assert(textureDim[0] == 1)
+    assert(texture.arrayLength == (textureDim[1] + 3) / 4)
+    texture2tensor_loop(texture: texture) { (position: [Int], value: P) in
+      // position is [x, y, slice, component].
+      let textureIndex: [Int] = [0, position[2] * 4 + position[3], position[1], position[0]]
+      var tensorIndex: [Int] = [0, 0, 0, 0]
+      for axis in 0..<4 {
+        tensorIndex[transpose[axis]] = textureIndex[axis]
+      }
+      let flatIndex = tensorIndex[0] * paddedDim[1] * paddedDim[2] * paddedDim[3] + tensorIndex[1] * paddedDim[2] * paddedDim[3] + tensorIndex[2] * paddedDim[3] + tensorIndex[3]
+      guard flatIndex < elementCount else {
+        return
+      }
+      flat[flatIndex] = value
+    }
+    return flat
+  }
+  /// Converts a texture that stores a 2-dimensional tensor into a flat array.
+  ///
+  /// `dim` is right-aligned into a 4-entry shape padded with leading 1s. For every texture
+  /// component, the row is texture axis 2 and `x * 4 + component` is axis 3; texture axis `k`
+  /// is then written to tensor axis `transpose[k]`. Components whose flat index is not below
+  /// the element count are skipped.
+  ///
+  /// - Parameters:
+  ///   - texture: Source texture; expected to have a single slice (checked with `assert`).
+  ///   - dim: Tensor shape; expected to have exactly two entries.
+  ///   - transpose: Axis mapping from texture layout to tensor layout.
+  /// - Returns: The tensor values, `dim.reduce(1, *)` elements long.
+  func texture2tensor_2<P>(texture: MTLTexture, dim: [Int],  transpose: [Int] = [0, 1, 2, 3]) -> [P] {
+    var paddedDim: [Int] = [1, 1, 1, 1]
+    for (offset, extent) in dim.enumerated() {
+      paddedDim[4 - dim.count + offset] = extent
+    }
+    let elementCount = dim.reduce(1, *)
+    var flat: [P] = Array(repeating: Float32(0.0) as! P, count: elementCount)
+    let textureDim: [Int] = transpose.map { paddedDim[$0] }
+    assert(dim.count == 2)
+    // Four consecutive values of the innermost axis share one pixel.
+    let expectedWidth = (textureDim[3] + 3) / 4
+    assert(texture.width == expectedWidth)
+    assert(texture.height == textureDim[2])
+    assert(textureDim[0] == 1)
+    assert(textureDim[1] == 1)
+    assert(texture.arrayLength == 1)
+    texture2tensor_loop(texture: texture) { (position: [Int], value: P) in
+      // position is [x, y, slice, component].
+      let textureIndex: [Int] = [0, 0, position[1], position[0] * 4 + position[3]]
+      var tensorIndex: [Int] = [0, 0, 0, 0]
+      for axis in 0..<4 {
+        tensorIndex[transpose[axis]] = textureIndex[axis]
+      }
+      let flatIndex = tensorIndex[0] * paddedDim[1] * paddedDim[2] * paddedDim[3] + tensorIndex[1] * paddedDim[2] * paddedDim[3] + tensorIndex[2] * paddedDim[3] + tensorIndex[3]
+      guard flatIndex < elementCount else {
+        return
+      }
+      flat[flatIndex] = value
+    }
+    return flat
+  }
+  /// Converts a texture that stores a 1-dimensional tensor into a flat array.
+  ///
+  /// `dim` is right-aligned into a 4-entry shape padded with leading 1s. For every texture
+  /// component, `x * 4 + component` is texture axis 3; texture axis `k` is then written to
+  /// tensor axis `transpose[k]`. Components whose flat index is not below the element count
+  /// are skipped.
+  ///
+  /// - Parameters:
+  ///   - texture: Source texture; expected to be one row high with a single slice (checked with `assert`).
+  ///   - dim: Tensor shape; expected to have exactly one entry.
+  ///   - transpose: Axis mapping from texture layout to tensor layout.
+  /// - Returns: The tensor values, `dim.reduce(1, *)` elements long.
+  func texture2tensor_1<P>(texture: MTLTexture, dim: [Int],  transpose: [Int] = [0, 1, 2, 3]) -> [P] {
+    var paddedDim: [Int] = [1, 1, 1, 1]
+    for (offset, extent) in dim.enumerated() {
+      paddedDim[4 - dim.count + offset] = extent
+    }
+    let elementCount = dim.reduce(1, *)
+    var flat: [P] = Array(repeating: Float32(0.0) as! P, count: elementCount)
+    let textureDim: [Int] = transpose.map { paddedDim[$0] }
+    assert(dim.count == 1)
+    // Four consecutive values share one pixel.
+    let expectedWidth = (textureDim[3] + 3) / 4
+    assert(texture.width == expectedWidth)
+    assert(texture.height == 1)
+    assert(textureDim[0] == 1)
+    assert(textureDim[1] == 1)
+    assert(textureDim[2] == 1)
+    assert(texture.arrayLength == 1)
+    texture2tensor_loop(texture: texture) { (position: [Int], value: P) in
+      // position is [x, y, slice, component].
+      let textureIndex: [Int] = [0, 0, 0, position[0] * 4 + position[3]]
+      var tensorIndex: [Int] = [0, 0, 0, 0]
+      for axis in 0..<4 {
+        tensorIndex[transpose[axis]] = textureIndex[axis]
+      }
+      let flatIndex = tensorIndex[0] * paddedDim[1] * paddedDim[2] * paddedDim[3] + tensorIndex[1] * paddedDim[2] * paddedDim[3] + tensorIndex[2] * paddedDim[3] + tensorIndex[3]
+      guard flatIndex < elementCount else {
+        return
+      }
+      flat[flatIndex] = value
+    }
+    return flat
+  }
  func texture2tensor<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
+    if dim.count == 3 {
+      return texture2tensor_3(texture: texture, dim: dim, transpose: transpose)
+    } else if dim.count == 2 {
+      return texture2tensor_2(texture: texture, dim: dim, transpose: transpose)
+    } else if dim.count == 1 {
+      return texture2tensor_1(texture: texture, dim: dim, transpose: transpose)
+    }
    var tdim: [Int] = [1, 1, 1, 1]
    for i in 0..<dim.count {
      tdim[4 - dim.count + i] = dim[i]
@@ -84,30 +205,19 @@ extension MTLDevice {
    assert(texture.height == ndim[1])
    assert(texture.arrayLength == (ndim[0] * ndim[3] + 3) / 4)
-    let bpR = ndim[2] * 4 * MemoryLayout<P>.size
+    texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
-    let bpI = ndim[1] * bpR
+      var tg: [Int] = [0, 0, 0, 0]
-    let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: ndim[2], height: ndim[1], depth: 1))
+      tg[1] = xyzn[1]
-    for i in 0..<texture.arrayLength {
+      tg[2] = xyzn[0]
-      let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: ndim[1] * ndim[2] * 4 * MemoryLayout<P>.size)
+      tg[0] = (xyzn[2] * 4 + xyzn[3]) / ndim[3]
-      texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i)
+      tg[3] = (xyzn[2] * 4 + xyzn[3]) % ndim[3]
+      var ig: [Int] = [0, 0, 0, 0]
-      for h in 0..<ndim[1] {
-        for w in 0..<ndim[2] {
      for k in 0..<4 {
-            let tx = (h * ndim[2] + w) * 4 + k
+        ig[transpose[k]] = tg[k]
-            let n = (i * 4 + k) / ndim[3]
-            let c = (i * 4 + k) % ndim[3]
-            let jg = [n, h, w, c]
-            var ig = [0, 0, 0, 0]
-            for d in 0..<4 {
-              ig[transpose[d]] = jg[d]
      }
      let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
      if ix < count {
-              tensor[ix] = pointer[tx]
+        tensor[ix] = v
-            }
-          }
-        }
      }
    }
    return tensor

--- a/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift
@@ -83,38 +83,38 @@ public class PaddleMobileUnitTest {
    }
    public func testConcat() {
-        let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
+//        let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
-        var it: [[Float32]] = []
+//        var it: [[Float32]] = []
-        for _ in 0..<7 {
+//        for _ in 0..<7 {
-            it.append((0..<12).map { Float32($0) })
+//            it.append((0..<12).map { Float32($0) })
-        }
+//        }
-        let input = it.map { device.tensor2texture(value: $0, dim: [3, 4]) }
+//        let input = it.map { device.tensor2texture(value: $0, dim: [3, 4]) }
-        let output = device.tensor2texture(value: [Float32](), dim: [3, 28])
+//        let output = device.tensor2texture(value: [Float32](), dim: [3, 28])
+//
-        let param = ConcatTestParam.init(
+//        let param = ConcatTestParam.init(
-            input: input,
+//            input: input,
-            output: output,
+//            output: output,
-            dims: [[3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]],
+//            dims: [[3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]],
-            axis: 1,
+//            axis: 1,
-            odim: [3, 28]
+//            odim: [3, 28]
-        )
+//        )
-        let concatKernel = ConcatKernel<Float32>.init(device: device, testParam: param)
+//        let concatKernel = ConcatKernel<Float32>.init(device: device, testParam: param)
-        concatKernel.test(cmdBuffer: buffer, param: param)
+//        concatKernel.test(cmdBuffer: buffer, param: param)
-        buffer.addCompletedHandler { (buffer) in
+//        buffer.addCompletedHandler { (buffer) in
-            for i in 0..<it.count {
+//            for i in 0..<it.count {
-                let _: Float32? = input[i].logDesc()
+//                let _: Float32? = input[i].logDesc()
-                self.tensorPrint(tensor: it[i], dim: [3, 4])
+//                self.tensorPrint(tensor: it[i], dim: [3, 4])
-            }
+//            }
-            let _: Float32? = output.logDesc()
+//            let _: Float32? = output.logDesc()
-            let tx: [Float32] = self.device.texture2tensor(texture: output, dim: [3, 28])
+//            let tx: [Float32] = self.device.texture2tensor(texture: output, dim: [3, 28])
-            self.tensorPrint(tensor: tx, dim: [3, 28])
+//            self.tensorPrint(tensor: tx, dim: [3, 28])
-        }
+//        }
+//
-        buffer.commit()
+//        buffer.commit()
    }
    public func testReshape() {
-        let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
+//        let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
 //        let input: [Float32] = (0..<24).map { Float32($0) }
 //        let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
 //        let outTexture = device.tensor2texture(value: [Float32](), dim: [4, 6])
@@ -139,32 +139,32 @@ public class PaddleMobileUnitTest {
 //            self.tensorPrint(tensor: tx, dim: [4, 6])
 //        }
-        let input: [Float32] = (0..<24).map { Float32($0) }
+//        let input: [Float32] = (0..<24).map { Float32($0) }
-        let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
+//        let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
-        let outTexture = device.tensor2texture(value: [Float32](), dim: [24])
+//        let outTexture = device.tensor2texture(value: [Float32](), dim: [24])
-        let mp = ReshapeMetalParam.init(
+//        let mp = ReshapeMetalParam.init(
-            idim: (1, 2, 3, 4),
+//            idim: (1, 2, 3, 4),
-            itrans: (0, 1, 2, 3),
+//            itrans: (0, 1, 2, 3),
-            odim: (1, 1, 1, 24),
+//            odim: (1, 1, 1, 24),
-            otrans: (0, 1, 2, 3)
+//            otrans: (0, 1, 2, 3)
-        )
+//        )
-        let param = ReshapeTestParam.init(
+//        let param = ReshapeTestParam.init(
-            inputTexture: inTexture,
+//            inputTexture: inTexture,
-            outputTexture: outTexture,
+//            outputTexture: outTexture,
-            param: mp
+//            param: mp
-        )
+//        )
-        let reshapeKernel = ReshapeKernel<Float32>.init(device: device, testParam: param)
+//        let reshapeKernel = ReshapeKernel<Float32>.init(device: device, testParam: param)
-        reshapeKernel.test(commandBuffer: buffer, testParam: param)
+//        reshapeKernel.test(commandBuffer: buffer, testParam: param)
-        buffer.addCompletedHandler { (buffer) in
+//        buffer.addCompletedHandler { (buffer) in
-            let _: Float32? = inTexture.logDesc()
+//            let _: Float32? = inTexture.logDesc()
-            let _: Float32? = outTexture.logDesc()
+//            let _: Float32? = outTexture.logDesc()
-            self.tensorPrint(tensor: input, dim: [2, 3, 4])
+//            self.tensorPrint(tensor: input, dim: [2, 3, 4])
-            let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [24])
+//            let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [24])
-            self.tensorPrint(tensor: tx, dim: [24])
+//            self.tensorPrint(tensor: tx, dim: [24])
-        }
+//        }
+//
+//        
-        buffer.commit()
+//        buffer.commit()
    }
    public func testTranspose() {
@@ -195,23 +195,23 @@ public class PaddleMobileUnitTest {
 //            let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 3, 2, 4])
 //            self.tensorPrint(tensor: tx, dim: [3, 3, 2, 4])
 //        }
+//
-        let input: [Float32] = (0..<24).map { Float32($0) }
+//        let input: [Float32] = (0..<24).map { Float32($0) }
-        let inputTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
+//        let inputTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
-        let outputTexture = device.tensor2texture(value: [Float](), dim: [3, 4, 2])
+//        let outputTexture = device.tensor2texture(value: [Float](), dim: [3, 4, 2])
-        let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [0, 2, 3, 1])
+//        let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [0, 2, 3, 1])
-        let transposeKernel = TransposeKernel<Float32>.init(device: device, testParam: param)
+//        let transposeKernel = TransposeKernel<Float32>.init(device: device, testParam: param)
+//
-        transposeKernel.test(commandBuffer: buffer, param: param)
+//        transposeKernel.test(commandBuffer: buffer, param: param)
+//
-        buffer.addCompletedHandler { (buffer) in
+//        buffer.addCompletedHandler { (buffer) in
-            let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false)
+//            let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false)
-            let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false)
+//            let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false)
-            self.tensorPrint(tensor: input, dim: [2, 3, 4])
+//            self.tensorPrint(tensor: input, dim: [2, 3, 4])
-            let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 4, 2])
+//            let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 4, 2])
-            self.tensorPrint(tensor: tx, dim: [3, 4, 2])
+//            self.tensorPrint(tensor: tx, dim: [3, 4, 2])
-        }
+//        }
+//        
        buffer.commit()
    }

--- a/metal/paddle-mobile/paddle-mobile/Common/Types.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/Types.swift
@@ -243,7 +243,7 @@ extension Tensor: Variant {
 extension Texture: Variant {
 }
-extension ResultHolder: Variant {
+extension GPUResultHolder: Variant {
 }
 extension InputTexture: Variant {
@@ -252,3 +252,43 @@ extension InputTexture: Variant {
 extension MTLTexture where Self: Variant {
 }
+/// Holds a Metal buffer of `capacity` Float32 values together with the shape (`dim`)
+/// those values are meant to have.
+class FetchHolder: Variant {
+  /// Backing buffer; nil until `initBuffer(device:)` has been called.
+  var resultBuffer: MTLBuffer?
+  /// Shape associated with the held values.
+  var dim: [Int]
+  /// Number of Float32 elements the buffer is sized for.
+  var capacity: Int
+  /// Stores the element count and shape; no buffer is created yet.
+  init(inCapacity: Int, inDim: [Int]) {
+    dim = inDim
+    capacity = inCapacity
+  }
+  /// Creates the backing buffer on `device`, sized for `capacity` Float32 values.
+  func initBuffer(device: MTLDevice) {
+    let byteCount = capacity * MemoryLayout<Float32>.size
+    resultBuffer = device.makeBuffer(length: byteCount, options: [])
+  }
+  /// Typed pointer to the buffer contents. Traps when the buffer has not been created.
+  var result: UnsafeMutablePointer<Float32> {
+    if let buffer = resultBuffer {
+      return buffer.contents().bindMemory(to: Float32.self, capacity: capacity)
+    }
+    fatalError()
+  }
+}
+extension FetchHolder: CustomStringConvertible, CustomDebugStringConvertible {
+  /// Short summary of the holder's shape and element count.
+  ///
+  /// Deliberately avoids reading `result`, which traps while `resultBuffer` is nil, so that
+  /// printing or interpolating a holder can never terminate the process.
+  var description: String {
+    return "FetchHolder(dim: \(dim), capacity: \(capacity))"
+  }
+  /// Same summary as `description`, plus whether the backing buffer has been created.
+  var debugDescription: String {
+    return "FetchHolder(dim: \(dim), capacity: \(capacity), hasBuffer: \(resultBuffer != nil))"
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/Genet.swift
+++ b/metal/paddle-mobile/paddle-mobile/Genet.swift
@@ -46,8 +46,9 @@ public class Genet: Net {
    }
  }
-  override  public func resultStr(res: [Float]) -> String {
+  override  public func resultStr(res: ResultHolder) -> String {
-    return " \(Array<Float>(res.suffix(10))) ... "
+//    fatalError()
+    return " \(res.result![0]) ... "
  }
 }
--- a/metal/paddle-mobile/paddle-mobile/MobileNet.swift
+++ b/metal/paddle-mobile/paddle-mobile/MobileNet.swift
@@ -42,9 +42,12 @@ class MobileNet: Net{
  let labels = PreWords.init(fileName: "synset")
-  override public func resultStr(res: [Float]) -> String {
+  override public func resultStr(res: ResultHolder) -> String {
+    guard let resPointer = res.result else {
+      fatalError()
+    }
    var s: [String] = []
-    res.top(r: 5).enumerated().forEach{
+    (0..<res.capacity).map { resPointer[$0] }.top(r: 5).enumerated().forEach{
      s.append(String(format: "%d: %@ (%3.2f%%)", $0 + 1, labels[$1.0], $1.1 * 100))
    }
    return s.joined(separator: "\n")

--- a/metal/paddle-mobile/paddle-mobile/MobileNetSSD.swift
+++ b/metal/paddle-mobile/paddle-mobile/MobileNetSSD.swift
@@ -46,51 +46,52 @@ public class MobileNet_ssd_hand: Net{
    }
  }
-  override public func resultStr(res: [Float]) -> String {
+  override public func resultStr(res: ResultHolder) -> String {
    return " \(res)"
  }
-  override func fetchResult(paddleMobileRes: ResultHolder) -> [Float32] {
+  override func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder {
-    guard let interRes = paddleMobileRes.intermediateResults else {
+//    guard let interRes = paddleMobileRes.intermediateResults else {
-      fatalError(" need have inter result ")
+//      fatalError(" need have inter result ")
-    }
+//    }
-    guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as?  Texture<Float32> else {
-      fatalError(" need score ")
-    }
-    guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture<Float32> else {
-      fatalError()
-    }
-    var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3]))
-//    print("score: ")
-//    print(scoreFormatArr.strideArray())
 //
-    var bboxArr = bbox.metalTexture.float32Array()
+//    guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as?  Texture<Float32> else {
-//    print("bbox: ")
+//      fatalError(" need score ")
-//    print(bboxArr.strideArray())
+//    }
+//
-    let nmsCompute = NMSCompute.init()
+//    guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture<Float32> else {
-    nmsCompute.scoreThredshold = 0.01
+//      fatalError()
-    nmsCompute.nmsTopK = 400
+//    }
-    nmsCompute.keepTopK = 200
+//
-    nmsCompute.nmsEta = 1.0
+//    var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3]))
-    nmsCompute.nmsThreshold = 0.45
+////    print("score: ")
-    nmsCompute.background_label = 0;
+////    print(scoreFormatArr.strideArray())
+////
-    nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])]
+//    var bboxArr = bbox.metalTexture.float32Array()
+////    print("bbox: ")
-    nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])]
+////    print(bboxArr.strideArray())
-    guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else {
+//
-      fatalError( " result error " )
+//    let nmsCompute = NMSCompute.init()
-    }
+//    nmsCompute.scoreThredshold = 0.01
+//    nmsCompute.nmsTopK = 400
-    let output: [Float32] = result.map { $0.floatValue }
+//    nmsCompute.keepTopK = 200
+//    nmsCompute.nmsEta = 1.0
+//    nmsCompute.nmsThreshold = 0.45
-    return output
+//    nmsCompute.background_label = 0;
+//
+//    nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])]
+//
+//    nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])]
+//    guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else {
+//      fatalError( " result error " )
+//    }
+//
+//    let output: [Float32] = result.map { $0.floatValue }
+//
+//
+//    return output
+    fatalError()
  }

--- a/metal/paddle-mobile/paddle-mobile/MobilenetSSD_AR.swift
+++ b/metal/paddle-mobile/paddle-mobile/MobilenetSSD_AR.swift
@@ -30,50 +30,112 @@ public class MobileNet_ssd_AR: Net{
  class MobilenetssdPreProccess: CusomKernel {
    init(device: MTLDevice) {
      let s = CusomKernel.Shape.init(inWidth: 160, inHeight: 160, inChannel: 3)
-      super.init(device: device, inFunctionName: "mobilent_ar_preprocess_half", outputDim: s, usePaddleMobileLib: false)
+      super.init(device: device, inFunctionName: "mobilent_ar_preprocess", outputDim: s, usePaddleMobileLib: false)
    }
  }
-  override public func resultStr(res: [Float]) -> String {
+  override public func resultStr(res: ResultHolder) -> String {
-    return " \(res)"
+    return " \(res.result![0])"
  }
-  override func fetchResult(paddleMobileRes: ResultHolder) -> [Float32] {
+  override func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder {
    guard let interRes = paddleMobileRes.intermediateResults else {
      fatalError(" need have inter result ")
    }
-    guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as?  Texture<Float32> else {
+    guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as?  FetchHolder else {
      fatalError(" need score ")
    }
-    guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture<Float32> else {
+    guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? FetchHolder else {
      fatalError()
    }
-    var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3]))
+//    let startDate = Date.init()
-    //    print("score: ")
-    //    print(scoreFormatArr.strideArray())
+//    print("scoreFormatArr: ")
-    //
+//print((0..<score.capacity).map{ score.result[$0] }.strideArray())
-    var bboxArr = bbox.metalTexture.float32Array()
+//
-    //    print("bbox: ")
+//    print("bbox arr: ")
-    //    print(bboxArr.strideArray())
+//
+//    print((0..<bbox.capacity).map{ bbox.result[$0] }.strideArray())
    let nmsCompute = NMSCompute.init()
-    nmsCompute.scoreThredshold = 0.01
+    nmsCompute.scoreThredshold = 0.25
-    nmsCompute.nmsTopK = 400
+    nmsCompute.nmsTopK = 100
-    nmsCompute.keepTopK = 200
+    nmsCompute.keepTopK = 100
    nmsCompute.nmsEta = 1.0
-    nmsCompute.nmsThreshold = 0.45
+    nmsCompute.nmsThreshold = 0.449999988
    nmsCompute.background_label = 0;
-    nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])]
+    nmsCompute.scoreDim = [NSNumber.init(value: score.dim[0]), NSNumber.init(value: score.dim[1]), NSNumber.init(value: score.dim[2])]
-    nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])]
+    nmsCompute.bboxDim = [NSNumber.init(value: bbox.dim[0]), NSNumber.init(value: bbox.dim[1]), NSNumber.init(value: bbox.dim[2])]
-    guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else {
+    guard let result = nmsCompute.compute(withScore: score.result, andBBoxs: bbox.result) else {
      fatalError( " result error " )
    }
+    let resultHolder = ResultHolder.init(inResult: result.output, inCapacity: Int(result.outputSize))
+//    for i in 0..<Int(result.outputSize) {
+//
+//      print("i \(i) : \(result.output[i])")
+//    }
+//    print(Date.init().timeIntervalSince(startDate))
-    let output: [Float32] = result.map { $0.floatValue }
+//    print(resultHolder.result![0])
-    return output
+    return resultHolder
  }
+  override func updateProgram(program: Program) {
+    for i in [56, 66, 76, 86, 93, 99] {
+      let opDesc = program.programDesc.blocks[0].ops[i]
+      let output = opDesc.outputs["Out"]!.first!
+      let v = program.scope[output]!
+      let originTexture = v as! Texture<Float32>
+      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1] / 7, originTexture.tensorDim[0] * 7])
+      originTexture.dim = Dim.init(inDim: [1, 1, originTexture.dim[3] / 7, originTexture.dim[2] * 7])
+      originTexture.padToFourDim = Dim.init(inDim: [1, 1, originTexture.padToFourDim[3] / 7, originTexture.padToFourDim[2] * 7])
+      program.scope[output] = originTexture
+      if i == 99 {
+        opDesc.attrs["axis"] = 0
+      } else {
+        opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
+      }
+    }
+    for i in [58, 59, 88, 89, 95, 96, 68, 69, 78, 79] {
+      let opDesc = program.programDesc.blocks[0].ops[i]
+      let output = opDesc.outputs["Out"]!.first!
+      let v = program.scope[output]!
+      let originTexture = v as! Texture<Float32>
+      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
+      opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
+    }
+    for i in [60, 101, 90, 97, 70, 80] {
+      let opDesc = program.programDesc.blocks[0].ops[i]
+      let output = opDesc.outputs["Out"]!.first!
+      let v = program.scope[output]!
+      let originTexture = v as! Texture<Float32>
+      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
+      opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
+    }
+    for i in [102] {
+      let opDesc = program.programDesc.blocks[0].ops[i]
+      for output in opDesc.outputs["Out"]! {
+        let v = program.scope[output]!
+        let originTexture = v as! Texture<Float32>
+        originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
+      }
+      opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
+      print(" split axis \(opDesc.attrs["axis"])")
+    }
+    // 99
+  }
 }
--- a/metal/paddle-mobile/paddle-mobile/Net.swift
+++ b/metal/paddle-mobile/paddle-mobile/Net.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+import Foundation
+public class ResultHolder: NSObject {
+  @objc public let result: UnsafeMutablePointer<Float32>?
+  @objc public let capacity: Int
+  init(inResult: UnsafeMutablePointer<Float32>?, inCapacity: Int) {
+    result = inResult
+    capacity = inCapacity
+  }
+  public func releasePointer() {
+    result?.deinitialize(count: capacity)
+    result?.deallocate()
+  }
+}
+public class Net: NSObject {
+  var except: Int = 0
+  var means: [Float] = []
+  var scale: Float = 0.0
+  var dim: (n: Int, h: Int, w: Int, c: Int) = (n: 0, h: 0, w: 0, c: 0)
+  var preprocessKernel: CusomKernel? = nil
+  var paramPointer: UnsafeMutableRawPointer? = nil
+  var paramSize: Int = 0
+  var modelPointer: UnsafeMutableRawPointer? = nil
+  var modelSize: Int = 0
+  var modelPath: String = ""
+  var paramPath: String = ""
+  var modelDir: String = ""
+  @objc public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) {
+      self.paramPointer = paramPointer
+      self.paramSize = paramSize
+      self.modelPointer = modePointer
+      self.modelSize = modelSize
+      super.init()
+  }
+  public func resultStr(res: ResultHolder) -> String {
+    fatalError()
+  }
+  func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder {
+    return ResultHolder.init(inResult: paddleMobileRes.resultPointer, inCapacity: paddleMobileRes.capacity)
+  }
+  @objc public init(device: MTLDevice) {
+    super.init()
+  }
+  func updateProgram(program: Program) {
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift
@@ -64,7 +64,8 @@ class OpCreator<P: PrecisionType> {
         gBilinearInterpType        :     BilinearInterpOp<P>.creat,
         gSplit                     :     SplitOp<P>.creat,
         gShape                     :     ShapeOp<P>.creat,
-         gFlatten                   :     FlattenOp<P>.creat]
+         gFlatten                   :     FlattenOp<P>.creat,
+         gConvAddPreluType          :     ConvAddPreluOp<P>.creat]
    private init(){}
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/Base/Operator.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/Operator.swift
@@ -19,6 +19,12 @@ protocol Fusion {
  static func fusionNode() -> Node
  static func change() -> [String : [(from: String, to: String)]]
  static func fusionType() -> String
+  static func needCheck() -> [(Int, String)]
+}
+extension Fusion {
+  static func needCheck() -> [(Int, String)] {
+    return []
+  }
 }
 protocol Runable {
@@ -26,6 +32,7 @@ protocol Runable {
  func runImpl(device: MTLDevice,buffer: MTLCommandBuffer) throws
  func delogOutput()
  func inputVariant() -> [String : [Variant]]
+  func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer)
 }
 extension Runable where Self: OperatorProtocol{
@@ -38,11 +45,16 @@ extension Runable where Self: OperatorProtocol{
  }
  func inputVariant() -> [String : [Variant]] {
-    return [:]
+//    return [:]
-//    fatalError(" op \(type) need implement inputVariant")
+    fatalError(" op \(type) need implement inputVariant")
+  }
+  func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) {
+    fatalError(" need implement ")
  }
  func delogOutput() {
    print(type + ": has no implementation" )
  }
 }
@@ -144,6 +156,7 @@ let gBilinearInterpType         = "bilinear_interp"
 let gSplit                      = "split"
 let gShape                      = "shape"
 let gFlatten                    = "flatten"
+let gConvAddPreluType           = "conv_add_prelu"
 let opInfos = [gConvType                    : (inputs: ["Input"], outputs: ["Output"]),
               gBatchNormType               : (inputs: ["X"], outputs: ["Y"]),
@@ -169,5 +182,7 @@ let opInfos = [gConvType                    : (inputs: ["Input"], outputs: ["Out
               gBilinearInterpType          : (inputs: ["X"], outputs: ["Out"]),
               gSplit                       : (inputs: ["X"], outputs: ["Out"]),
               gShape                       : (inputs: ["Input"], outputs: ["Out"]),
-               gFlatten                     : (inputs: ["X"], outputs: ["Out"])
+               gFlatten                     : (inputs: ["X"], outputs: ["Out"]),
+               gConvAddPreluType            : (inputs: ["Input"], outputs: ["Out"])
              ]
--- a/metal/paddle-mobile/paddle-mobile/Operators/BatchNormOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/BatchNormOp.swift
@@ -19,11 +19,14 @@ class BatchNormParam<P: PrecisionType>: OpParam {
  required init(opDesc: OpDesc, inScope: Scope) throws {
    do {
      input = try BatchNormParam.inputX(inputs: opDesc.inputs, from: inScope)
+      if input.transpose != [0, 2, 3, 1] {
+        fatalError("batch norm only accepts NHWC")
+      }
      output = try BatchNormParam.outputY(outputs: opDesc.outputs, from: inScope)
-      inputBias = try BatchNormParam.inputBiase(inputs: opDesc.paraInputs, from: inScope)
+      bias = try BatchNormParam.getFirstTensor(key: "Bias", map: opDesc.paraInputs, from: inScope)
-      inputMean = try BatchNormParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
+      mean = try BatchNormParam.getFirstTensor(key: "Mean", map: opDesc.paraInputs, from: inScope)
-      inputScale = try BatchNormParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
+      scale = try BatchNormParam.getFirstTensor(key: "Scale", map: opDesc.paraInputs, from: inScope)
-      inputVariance = try BatchNormParam.inputVariance(inputs: opDesc.paraInputs, from: inScope)
+      variance = try BatchNormParam.getFirstTensor(key: "Variance", map: opDesc.paraInputs, from: inScope)
      epsilon = try BatchNormParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
      momentum = try BatchNormParam.getAttr(key: "momentum", attrs: opDesc.attrs)
    } catch let error {
@@ -32,10 +35,10 @@ class BatchNormParam<P: PrecisionType>: OpParam {
  }
  let input: Texture<P>
  var output: Texture<P>
-  let inputBias: Tensor<ParamPrecisionType>
+  let bias: Tensor<P>
-  let inputMean: Tensor<ParamPrecisionType>
+  let mean: Tensor<P>
-  let inputScale: Tensor<ParamPrecisionType>
+  let scale: Tensor<P>
-  let inputVariance: Tensor<ParamPrecisionType>
+  let variance: Tensor<P>
  let epsilon: Float
  let momentum: Float
 }
@@ -53,9 +56,11 @@ class BatchNormOp<P: PrecisionType>: Operator<BatchNormKernel<P>, BatchNormParam
      throw error
    }
  }
-}
+  func delogOutput() {
+    print(" \(type) output: ")
+    let device = para.output.metalTexture!.device
+    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+    print(outputArray.strideArray())
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/BilinearInterpOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/BilinearInterpOp.swift
@@ -19,15 +19,15 @@ class BilinearInterpParam<P: PrecisionType>: OpParam {
  required init(opDesc: OpDesc, inScope: Scope) throws {
    do {
      input = try BilinearInterpParam.inputX(inputs: opDesc.inputs, from: inScope)
-//      if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) {
-//        fatalError()
-//      }
      output = try BilinearInterpParam.outputOut(outputs: opDesc.outputs, from: inScope)
      out_h = try BilinearInterpParam.getAttr(key: "out_h", attrs: opDesc.attrs)
      out_w = try BilinearInterpParam.getAttr(key: "out_w", attrs: opDesc.attrs)
    } catch let error {
      throw error
    }
+    if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) {
+      fatalError()
+    }
  }
  let input: Texture<P>
  var output: Texture<P>
@@ -53,6 +53,10 @@ class BilinearInterpOp<P: PrecisionType>: Operator<BilinearInterpKernel<P>, Bili
  func delogOutput() {
    print(" \(type) output: ")
+    let device = para.output.metalTexture!.device
+    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+//    print(outputArray)
+    print(outputArray.strideArray())
  }
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift
@@ -27,6 +27,10 @@ class BoxcoderParam<P: PrecisionType>: OpParam {
    } catch let error {
      throw error
    }
+    assert(priorBox.tensorDim.cout() == 2)
+    assert(priorBoxVar.tensorDim.cout() == 2)
+    assert(targetBox.tensorDim.cout() == 3)
+    assert(output.tensorDim.cout() == 3)
    assert(priorBox.transpose == [0, 1, 2, 3])
    assert(priorBoxVar.transpose == [0, 1, 2, 3])
    assert(targetBox.transpose == [0, 1, 2, 3])
@@ -59,30 +63,19 @@ class BoxcoderOp<P: PrecisionType>: Operator<BoxcoderKernel<P>, BoxcoderParam<P>
  func delogOutput() {
    print(" \(type) output: ")
-//    let priorBoxpadToFourDim = para.priorBox.padToFourDim
+    let device = para.output.metalTexture!.device
-//    let priorBoxArray: [Float32] = para.priorBox.metalTexture.realNHWC(dim: (n: priorBoxpadToFourDim[0], h: priorBoxpadToFourDim[1], w: priorBoxpadToFourDim[2], c: priorBoxpadToFourDim[3]))
+    let pbv : [Float32] = device.texture2tensor(texture: para.priorBoxVar.metalTexture!, dim: para.priorBoxVar.tensorDim.dims, transpose: para.priorBoxVar.transpose)
-//    print(" prior box ")
+    let pb : [Float32] = device.texture2tensor(texture: para.priorBox.metalTexture!, dim: para.priorBox.tensorDim.dims, transpose: para.priorBox.transpose)
-//    print(priorBoxArray.strideArray())
+    let tb : [Float32] = device.texture2tensor(texture: para.targetBox.metalTexture!, dim: para.targetBox.tensorDim.dims, transpose: para.targetBox.transpose)
-//
+    let out : [Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-//    let priorBoxVarpadToFourDim = para.priorBoxVar.padToFourDim
+    print(" prior box var ")
-//    let priorBoxVarArray: [Float32] = para.priorBoxVar.metalTexture.realNHWC(dim: (n: priorBoxVarpadToFourDim[0], h: priorBoxVarpadToFourDim[1], w: priorBoxVarpadToFourDim[2], c: priorBoxVarpadToFourDim[3]))
+    print(pbv.strideArray())
-//    print(" prior box var ")
-//    print(priorBoxVarArray.strideArray())
-//
-//    let targetBoxpadToFourDim = para.targetBox.padToFourDim
-//    let targetBoxArray: [Float32] = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxpadToFourDim[0], h: targetBoxpadToFourDim[1], w: targetBoxpadToFourDim[2], c: targetBoxpadToFourDim[3]))
-//    print(" target box ")
-//    print(targetBoxArray.strideArray())
-    let targetBoxpadToFourDim = para.targetBox.padToFourDim
-    let targetBoxArray = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxpadToFourDim[0], h: targetBoxpadToFourDim[1], w: targetBoxpadToFourDim[2], c: targetBoxpadToFourDim[3]))
    print(" target box ")
-    print(targetBoxArray.strideArray())
+    print(tb.strideArray())
+    print(" prior box ")
-    let padToFourDim = para.output.padToFourDim
+    print(pb.strideArray())
-    let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
    print(" output ")
-    print(outputArray.strideArray())
+    print(out.strideArray())
  }
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
@@ -65,15 +65,10 @@ class ConcatOp<P: PrecisionType>: Operator<ConcatKernel<P>, ConcatParam<P>>, Run
  func delogOutput() {
    print(" \(type) output: ")
-    let padToFourDim = para.output.padToFourDim
-    if para.output.transpose == [0, 1, 2, 3] {
+    let device = para.output.metalTexture!.device
-      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
+    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
    print(outputArray.strideArray())
-    } else if para.output.transpose == [0, 2, 3, 1] {
-      print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3])).strideArray())
-    } else {
-      fatalError(" not implemet")
-    }
  }
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddPreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddPreluOp.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+import Foundation
+class ConvAddPreluParam<P: PrecisionType>: OpParam {
+  typealias ParamPrecisionType = P
+  required init(opDesc: OpDesc, inScope: Scope) throws {
+    do {
+      filter = try ConvAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
+      input = try ConvAddPreluParam.input(inputs: opDesc.inputs, from: inScope)
+      output = try ConvAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+      stride = try ConvAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs)
+      paddings = try ConvAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
+      dilations = try ConvAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
+      groups = try ConvAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs)
+      alpha = try ConvAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
+      mode = try ConvAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
+      y = try ConvAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+    } catch let error {
+      throw error
+    }
+  }
+  let input: Texture<P>
+  let y: Tensor<ParamPrecisionType>
+  let filter: Tensor<ParamPrecisionType>
+  let mode: String
+  let alpha: Tensor<P>
+  var output: Texture<P>
+  let stride: [Int32]
+  let paddings: [Int32]
+  let dilations: [Int32]
+  let groups: Int
+}
+class ConvAddPreluOp<P: PrecisionType>: Operator<ConvAddPreluKernel<P>, ConvAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
+  typealias OpType = ConvAddPreluOp<P>
+  static func fusionNode() -> Node {
+    let beginNode = Node.init(inType: gConvType)
+    _ = beginNode
+      --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType)
+    return beginNode
+  }
+  static func change() -> [String : [(from: String, to: String)]] {
+    return [:]
+  }
+  static func fusionType() -> String {
+    return gConvAddPreluType
+  }
+  func inferShape() {
+    let inDims = para.input.dim
+    let filterDim = para.filter.dim
+    let strides = para.stride
+    let paddings = para.paddings
+    let dilations = para.dilations
+    var outDim = [inDims[0]]
+    for i in 0..<strides.count {
+      let dilation: Int = Int(dilations[i])
+      let filterSize: Int = filterDim[i + 1]
+      let inputSize: Int = inDims[i + 1]
+      let padding: Int = Int(paddings[i])
+      let stride: Int = Int(strides[i])
+      let dKernel = dilation * (filterSize - 1) + 1
+      let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+      outDim.append(outputSize)
+    }
+    outDim.append(filterDim[0])
+    para.output.dim = Dim.init(inDim: outDim)
+  }
+  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+    do {
+      try kernel.compute(commandBuffer: buffer, param: para)
+    } catch let error {
+      throw error
+    }
+  }
+  func delogOutput() {
+    print(" \(type) output: ")
+    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/FetchOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/FetchOp.swift
@@ -15,14 +15,15 @@
 import Foundation
 class FetchParam<P: PrecisionType>: OpParam{
-  var output: Texture<P>
+  var output: FetchHolder
  let input: Texture<P>
  let scope: Scope
  required init(opDesc: OpDesc, inScope: Scope) throws {
    scope = inScope
    do {
      input = try FetchParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = input
+      output = FetchHolder.init(inCapacity: input.numel(), inDim: input.tensorDim.dims)
+      scope.setOutput(output: output)
    } catch let error {
      throw error
    }
@@ -34,14 +35,40 @@ class FetchParam<P: PrecisionType>: OpParam{
 class FetchKernel<P: PrecisionType>: Kernel, Computable {
  func compute(commandBuffer: MTLCommandBuffer, param: FetchParam<P>) throws {
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+      throw PaddleMobileError.predictError(message: " encode is nil")
+    }
+    encoder.setTexture(param.input.metalTexture, index: 0)
+    encoder.setBuffer(param.output.resultBuffer!, offset: 0, index: 0)
+    encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture)
+    encoder.endEncoding()
  }
  required init(device: MTLDevice, param: FetchParam<P>) {
-    super.init(device: device, inFunctionName: "place_holder")
+    param.output.initBuffer(device: device)
+    if computePrecision == .Float16 {
+      if param.input.transpose == [0, 2, 3, 1] {
+        super.init(device: device, inFunctionName: "fetch_half")
+      } else {
+//        fatalError(" not support ")
+        super.init(device: device, inFunctionName: "fetch_placeholder_half")
+        print(" not support ")
+      }
+    } else if computePrecision == .Float32 {
+      if param.input.transpose == [0, 2, 3, 1] {
+        super.init(device: device, inFunctionName: "fetch")
+      } else {
+        print(" not support ")
+        super.init(device: device, inFunctionName: "fetch_placeholder")
+//        fatalError(" not support ")        
+      }
+    } else {
+      fatalError(" not support ")
+    }
  }
 }
-class FetchOp<P: PrecisionType>: Operator< FetchKernel<P>, FetchParam<P>>, Runable, Creator, InferShaperable{
+class FetchOp<P: PrecisionType>: Operator< FetchKernel<P>, FetchParam<P>>, Runable, Creator, InferShaperable {
  typealias OpType = FetchOp<P>
@@ -50,7 +77,11 @@ class FetchOp<P: PrecisionType>: Operator< FetchKernel<P>, FetchParam<P>>, Runab
  }
  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    scope.setOutput(output: para.output)
+    do {
+      try kernel.compute(commandBuffer: buffer, param: para)
+    } catch let error {
+      throw error
+    }
  }
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/FlattenOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/FlattenOp.swift
@@ -14,7 +14,24 @@
 import Foundation
-class FlattenOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>, Runable, Creator, InferShaperable{
+class FlattenParam<P: PrecisionType>: OpParam {
+  typealias ParamPrecisionType = P
+  required init(opDesc: OpDesc, inScope: Scope) throws {
+    do {
+      input = try FlattenParam.inputX(inputs: opDesc.inputs, from: inScope)
+      output = try FlattenParam.outputOut(outputs: opDesc.outputs, from: inScope)
+      axis = try FlattenParam.getAttr(key: "axis", attrs: opDesc.attrs)
+    } catch let error {
+      throw error
+    }
+  }
+  let input: Texture<P>
+  var output: Texture<P>
+  let axis: Int
+}
+class FlattenOp<P: PrecisionType>: Operator<FlattenKernel<P>, FlattenParam<P>>, Runable, Creator, InferShaperable{
  typealias OpType = FlattenOp<P>
@@ -32,6 +49,9 @@ class FlattenOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>,
  func delogOutput() {
    print(" \(type) output: ")
+    let device = para.output.metalTexture!.device
+    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+    print(outputArray.strideArray())
  }
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift
@@ -15,20 +15,21 @@
 import Foundation
 class BatchNormKernel<P: PrecisionType>: Kernel, Computable {
-//  var newScale: MTLBuffer
-//  var newBias: MTLBuffer
-//
  required init(device: MTLDevice, param: BatchNormParam<P>) {
-//    guard let newScale = device.makeBuffer(length: param.inputScale.buffer.length) else {
+    let count = param.variance.dim.numel()
-//      fatalError()
+    let varianceP = param.variance.data.pointer
-//    }
+    let meanP = param.mean.data.pointer
-//
+    let scaleP = param.scale.data.pointer
-//    guard let newBias = device.makeBuffer(length: param.inputBias.buffer.length) else {
+    let biasP = param.bias.data.pointer
-//      fatalError()
+    for i in 0..<count {
-//    }
+      let invStd = P(1 / (Float32(varianceP[i]) + param.epsilon).squareRoot())
-//    self.newScale = newScale
+      biasP[i] = biasP[i] - meanP[i] * invStd * scaleP[i]
-//    self.newBias = newBias
+      scaleP[i] = invStd * scaleP[i]
-//
+    }
+    param.bias.initBuffer(device: device, precision: computePrecision)
+    param.scale.initBuffer(device: device, precision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
    if computePrecision == .Float32 {
      super.init(device: device, inFunctionName: "batchnorm")
    } else if computePrecision == .Float16 {
@@ -36,37 +37,16 @@ class BatchNormKernel<P: PrecisionType>: Kernel, Computable {
    } else {
      fatalError()
    }
-//
-//    let varianceBuffer : MTLBuffer = param.inputVariance.buffer
-//
-//    var invStd: [Float32] = Array(repeating: 0, count: varianceBuffer.length)
-//    let varianceContents = varianceBuffer.contents().assumingMemoryBound(to: P.self)
-//    for i in 0..<(varianceBuffer.length / MemoryLayout<P>.stride) {
-//      invStd[i] = 1 / (Float32(varianceContents[i]) + param.epsilon).squareRoot()
-//    }
-//
-//    let newScaleContents = newScale.contents().assumingMemoryBound(to: P.self)
-//    let newBiasContents = newBias.contents().assumingMemoryBound(to: P.self)
-//    let scale : MTLBuffer = param.inputScale.buffer
-//    let scaleContents = scale.contents().assumingMemoryBound(to: P.self)
-//    let bias : MTLBuffer = param.inputBias.buffer
-//    let biasContents = bias.contents().assumingMemoryBound(to: P.self)
-//    let meanContents = param.inputMean.buffer.contents().assumingMemoryBound(to: P.self)
-//
-//    for i in 0..<(newScale.length / MemoryLayout<P>.stride) {
-//      newScaleContents[i] = P(invStd[i] * Float32(scaleContents[i]))
-//      newBiasContents[i] = P(Float32(biasContents[i]) - Float32(meanContents[i]) * invStd[i] * Float32(scaleContents[i]))
-//    }
  }
  func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam<P>) throws {
    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
      throw PaddleMobileError.predictError(message: " encoder is nil")
    }
-//    encoder.setTexture(param.input.metalTexture, index: 0)
+    encoder.setTexture(param.input.metalTexture, index: 0)
-//    encoder.setTexture(param.output.metalTexture, index: 1)
+    encoder.setTexture(param.output.metalTexture, index: 1)
-//    encoder.setBuffer(newScale, offset: 0, index: 0)
+    encoder.setBuffer(param.scale.buffer, offset: 0, index: 0)
-//    encoder.setBuffer(newBias, offset: 0, index: 1)
+    encoder.setBuffer(param.bias.buffer, offset: 0, index: 1)
    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
    encoder.endEncoding()
  }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BilinearInterpKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BilinearInterpKernel.swift
@@ -27,10 +27,16 @@ class BilinearInterpKernel<P: PrecisionType>: Kernel, Computable{
    encoder.setTexture(param.input.metalTexture, index: 0)
    encoder.setTexture(param.output.metalTexture, index: 1)
-    let ratio_h: Float32 = Float32(param.input.tensorDim.dims[2]) / Float32(param.output.tensorDim.dims[2])
+    var ratio_h: Float32 = 0
-    let ratio_w: Float32 = Float32(param.input.tensorDim.dims[3]) / Float32(param.output.tensorDim.dims[3])
+    var ratio_w: Float32 = 0
+    if param.output.tensorDim.dims[2] > 1 {
+      ratio_h = Float32(param.input.tensorDim.dims[2]-1) / Float32(param.output.tensorDim.dims[2]-1)
+    }
+    if param.output.tensorDim.dims[3] > 1 {
+      ratio_w = Float32(param.input.tensorDim.dims[3]-1) / Float32(param.output.tensorDim.dims[3]-1)
+    }
    var p = BilinearInterpMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w)
-    encoder.setBytes(&p, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
+    encoder.setBytes(&p, length: MemoryLayout<BilinearInterpMetalParam>.size, index: 0)
    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
    encoder.endEncoding()
  }
@@ -38,7 +44,7 @@ class BilinearInterpKernel<P: PrecisionType>: Kernel, Computable{
  required init(device: MTLDevice, param: BilinearInterpParam<P>) {
    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
    if computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "bilinear_interp")
+      super.init(device: device, inFunctionName: "bilinear_interp_float")
    } else if computePrecision == .Float16 {
      super.init(device: device, inFunctionName: "bilinear_interp_half")
    } else {

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift
@@ -33,9 +33,9 @@ class BoxcoderKernel<P: PrecisionType>: Kernel, Computable{
  }
  required init(device: MTLDevice, param: BoxcoderParam<P>) {
-    param.output.initTexture(device: device, computePrecision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: [0, 3, 1, 2], computePrecision: computePrecision)
    if computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "boxcoder")
+      super.init(device: device, inFunctionName: "boxcoder_float")
    } else if computePrecision == .Float16 {
      super.init(device: device, inFunctionName: "boxcoder_half")
    } else {

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift
@@ -31,101 +31,111 @@ struct ConcatMetalParam {
 }
 class ConcatKernel<P: PrecisionType>: Kernel, Computable{
+  var v = "normal"
+  var pm = ConcatMetalParam.init()
+  func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam<P>) throws {
-  func encodeTest(_ cmdBuffer: MTLCommandBuffer, _ param: ConcatTestParam, _ istart: Int, _ iend: Int) {
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-    let encoder = cmdBuffer.makeComputeCommandEncoder()!
+      throw PaddleMobileError.predictError(message: " encode is nil")
-    var p = ConcatMetalParam.init()
+    }
-    var odim: [Int32] = [1, 1, 1, 1]
+    let num = param.input.count
-    for i in 0..<param.odim.count {
+    for i in 0..<num {
-      odim[4-param.odim.count+i] = Int32(param.odim[i])
+      encoder.setTexture(param.input[i].metalTexture, index: i)
    }
-    p.odim = (odim[0], odim[1], odim[2], odim[3])
+    encoder.setTexture(param.output.metalTexture, index: num)
-    p.axis = Int32(4 - param.odim.count + param.axis)
+    if v == "normal" {
-    for i in 0..<istart {
+      encoder.setTexture(param.output.metalTexture, index: num + 1)
-      p.offset += Int32(param.dims[i][param.axis])
+    }
-    }
+    encoder.setBytes(&pm, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
-    var vdim: [Int32] = []
+    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    for i in 0..<(iend - istart) {
-      encoder.setTexture(param.input[i+istart], index: i)
-      vdim.append(Int32(param.dims[i+istart][Int(param.axis)]))
-    }
-    for i in (iend-istart)..<6 {
-      encoder.setTexture(param.input[0], index: i)
-      vdim.append(0)
-    }
-    p.vdim = (vdim[0], vdim[1], vdim[2], vdim[3], vdim[4], vdim[5])
-    encoder.setTexture(param.output, index: 6)
-    encoder.setTexture(param.output, index: 7)
-    encoder.setBytes(&p, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output)
    encoder.endEncoding()
  }
-  func encode(_ cmdBuffer: MTLCommandBuffer, _ param: ConcatParam<P>, _ istart: Int, _ iend: Int) throws {
+  required init(device: MTLDevice, param: ConcatParam<P>) {
-    guard let encoder = cmdBuffer.makeComputeCommandEncoder() else {
+    param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: computePrecision)
-      throw PaddleMobileError.predictError(message: " encode is nil")
+    let orank = param.output.tensorDim.cout()
-    }
+    let num = param.input.count
-    var p = ConcatMetalParam.init()
+    assert(num <= 6)
-    let odim = (0..<4).map { Int32(param.output.dim[$0]) }
+    var axis = 4 - param.output.tensorDim.cout() + param.axis
-    p.odim = (odim[0], odim[1], odim[2], odim[3])
-    p.axis = Int32(4 - param.output.tensorDim.cout() + param.axis)
    for i in 0..<4 {
-      if Int32(param.transpose[i]) == p.axis {
+      if param.transpose[i] == axis {
-        p.axis = Int32(i)
+        axis = i
        break
      }
    }
-    for i in 0..<istart {
+    pm.axis = Int32(axis)
-      p.offset += Int32(param.input[i+istart].dim[Int(p.axis)])
+    pm.odim = (Int32(param.output.dim[0]), Int32(param.output.dim[1]), Int32(param.output.dim[2]), Int32(param.output.dim[3]))
+    pm.trans = (Int32(param.output.transpose[0]), Int32(param.output.transpose[1]), Int32(param.output.transpose[2]), Int32(param.output.transpose[3]))
+    var vdim: [Int] = [0, 0, 0, 0, 0, 0]
+    for i in 0..<num {
+      vdim[i] = param.input[i].dim[axis]
    }
-    var vdim: [Int32] = []
+    if orank == 4 {
-    for i in 0..<(iend - istart) {
+      if axis == 1 {
-      encoder.setTexture(param.input[i+istart].metalTexture, index: i)
+        v = "y"
-      vdim.append(Int32(param.input[i+istart].dim[Int(p.axis)]))
+      } else if axis == 2 {
+        v = "x"
+      } else {
+        if (param.output.dim[0] == 1) && axis == 3 {
+          var vz = true
+          for i in 0..<num {
+            if vdim[i] % 4 != 0 {
+              vz = false
+              break
            }
-    for i in (iend-istart)..<6 {
-      encoder.setTexture(param.input[0].metalTexture, index: i)
-      vdim.append(0)
          }
-    p.trans = (Int32(param.transpose[0]), Int32(param.transpose[1]), Int32(param.transpose[2]), Int32(param.transpose[3]))
+          if vz {
+            v = "z"
-    p.vdim = (vdim[0], vdim[1], vdim[2], vdim[3], vdim[4], vdim[5])
+            for i in 0..<num {
-    encoder.setTexture(param.output.metalTexture, index: 6)
+              vdim[i] = vdim[i] / 4
-    encoder.setTexture(param.output.metalTexture, index: 7)
-    encoder.setBytes(&p, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-    encoder.endEncoding()
            }
-  func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam<P>) throws {
-    let group = param.input.count / 6
-    let remain = param.input.count % 6
-    for i in 0..<group {
-      try self.encode(commandBuffer, param, 6 * i, 6 * (i + 1))
          }
-    if remain > 0 {
-      try self.encode(commandBuffer, param, 6 * group, param.input.count)
        }
      }
+    } else if orank == 3 {
-  func test(cmdBuffer: MTLCommandBuffer, param: ConcatTestParam) {
+      if axis == 2 {
-    let group = param.input.count / 6
+        v = "y"
-    let remain = param.input.count % 6
+      } else if axis == 3 {
-    for i in 0..<group {
+        v = "x"
-      self.encodeTest(cmdBuffer, param, 6 * i, 6 * (i + 1))
+      } else if axis == 1 {
+        var vz = true
+        for i in 0..<num {
+          if vdim[i] % 4 != 0 {
+            vz = false
+            break
          }
-    if remain > 0 {
-      self.encodeTest(cmdBuffer, param, 6 * group, param.input.count)
        }
+        if vz {
+          v = "z"
+          for i in 0..<num {
+            vdim[i] = vdim[i] / 4
          }
+        }
-  required init(device: MTLDevice, param: ConcatParam<P>) {
+      }
-    param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: computePrecision)
+    } else {
+      if axis == 2 {
+        v = "y"
+      } else if axis == 3 {
+        var vx = true
+        for i in 0..<num {
+          if vdim[i] % 4 != 0 {
+            vx = false
+            break
+          }
+        }
+        if vx {
+          v = "x"
+          for i in 0..<num {
+            vdim[i] = vdim[i] / 4
+          }
+        }
+      }
+    }
+    pm.vdim = (Int32(vdim[0]), Int32(vdim[1]), Int32(vdim[2]), Int32(vdim[3]), Int32(vdim[4]), Int32(vdim[5]))
    if computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "concat")
+      super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_float")
    } else if computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "concat_half")
+      super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_half")
    } else {
      fatalError()
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddPreluKernel.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+import Foundation
+/// Kernel wrapper for the fused convolution + add + PReLU shaders.
+/// The shader function is chosen once, at construction time, from the compute
+/// precision, the filter geometry and the PReLU mode string.
+class ConvAddPreluKernel<P: PrecisionType>: Kernel, Computable {
+  /// Offsets, strides and dilations for the shader; assigned at the end of `init`.
+  var metalParam: MetalConvParam!
+  /// Prepares the output texture and the filter / y / alpha buffers, then
+  /// resolves the shader function name as `<conv>_prelu_<mode>_<precision>`.
+  /// Traps with `fatalError` on an unsupported precision or filter geometry.
+  required init(device: MTLDevice, param: ConvAddPreluParam<P>) {
+    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
+    param.filter.initBuffer(device: device, precision: computePrecision)
+    param.y.initBuffer(device: device, precision: computePrecision)
+    param.alpha.initBuffer(device: device, precision: computePrecision)
+    // Precision is checked first so an unknown precision traps before the geometry is inspected.
+    let precisionSuffix: String
+    if computePrecision == .Float16 {
+      precisionSuffix = "half"
+    } else if computePrecision == .Float32 {
+      precisionSuffix = "float"
+    } else {
+      fatalError()
+    }
+    // Branch order matters: 1x1 wins over the depthwise test, which wins over the sized variants.
+    // Note the naming: width 1 / height 5 selects the "5x1" shader and width 5 / height 1 the "1x5" one.
+    let filterWidth = param.filter.width
+    let filterHeight = param.filter.height
+    let convPrefix: String
+    if filterWidth == 1 && filterHeight == 1 {
+      convPrefix = "conv_add_1x1"
+    } else if param.filter.channel == 1 {
+      convPrefix = "depthwise_conv_add_3x3"
+    } else if filterWidth == 3 && filterHeight == 3 {
+      convPrefix = "conv_add_3x3"
+    } else if filterWidth == 1 && filterHeight == 5 {
+      convPrefix = "conv_add_5x1"
+    } else if filterWidth == 5 && filterHeight == 1 {
+      convPrefix = "conv_add_1x5"
+    } else {
+      fatalError(" unsupport yet ")
+    }
+    // Any mode other than "channel" or "element" falls back to the "other" shader.
+    let modeTag: String
+    if param.mode == "channel" {
+      modeTag = "channel"
+    } else if param.mode == "element" {
+      modeTag = "element"
+    } else {
+      modeTag = "other"
+    }
+    super.init(device: device, inFunctionName: "\(convPrefix)_prelu_\(modeTag)_\(precisionSuffix)")
+    // Offset of the sampling window: half of the dilated filter extent minus the padding.
+    let offsetX = (Int(param.dilations[0]) * (filterWidth - 1) + 1)/2 - Int(param.paddings[0])
+    let offsetY = (Int(param.dilations[1]) * (filterHeight - 1) + 1)/2 - Int(param.paddings[1])
+    let offsetZ: Int16 = 0
+    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: offsetZ, strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+  }
+  /// Currently encodes nothing: the whole encoding sequence below is commented out,
+  /// so calling this leaves the command buffer untouched.
+  func compute(commandBuffer: MTLCommandBuffer, param: ConvAddPreluParam<P>) throws {
+//    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+//      throw PaddleMobileError.predictError(message: " encode is nil")
+//    }
+//    
+//    encoder.setTexture(param.input.metalTexture, index: 0)
+//    encoder.setTexture(param.output.metalTexture, index: 1)
+//    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+//    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+//    encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
+//    encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3)
+//    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+//    encoder.endEncoding()
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/FlattenKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/FlattenKernel.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+import Foundation
+// Parameter block uploaded to the flatten shader by FlattenKernel.compute.
+struct FlattenMetalParam {
+  // Input tensor dimensions, right-aligned into four slots; FlattenKernel.init pads the leading slots with 1.
+  var idim: (Int32, Int32, Int32, Int32)
+  // Input transpose order, copied from param.input.transpose.
+  var itrans: (Int32, Int32, Int32, Int32)
+  // Output tensor dimensions, right-aligned into four slots with leading 1s.
+  var odim: (Int32, Int32, Int32, Int32)
+  // Output transpose order, copied from param.output.transpose.
+  var otrans: (Int32, Int32, Int32, Int32)
+}
+/// Kernel wrapper that flattens an input tensor into a rank-2 output by
+/// dispatching the `reshape_<inputRank>_2_<precision>` shader.
+class FlattenKernel<P: PrecisionType>: Kernel, Computable{
+  /// Input/output dimensions and transposes handed to the shader; built once in `init`.
+  var metalParam: FlattenMetalParam
+  /// Allocates the output texture, packs the dimension/transposition data and
+  /// selects the shader for the input rank and compute precision.
+  /// Traps with `fatalError` on an unsupported precision.
+  required init(device: MTLDevice, param: FlattenParam<P>) {
+    param.output.initTexture(device: device, computePrecision: computePrecision)
+    let irank = param.input.tensorDim.cout()
+    let orank = param.output.tensorDim.cout()
+    // Only flattening to two dimensions is supported; check before the index arithmetic below.
+    assert(orank == 2)
+    // Right-align the real dimensions into four slots, padding the leading ones with 1.
+    var id: [Int32] = [1, 1, 1, 1]
+    for i in 0..<irank {
+      id[4-irank+i] = Int32(param.input.tensorDim[i])
+    }
+    let it: [Int32] = param.input.transpose.map { Int32($0) }
+    var od: [Int32] = [1, 1, 1, 1]
+    for i in 0..<orank {
+      od[4-orank+i] = Int32(param.output.tensorDim[i])
+    }
+    let ot: [Int32] = param.output.transpose.map { Int32($0) }
+    metalParam = FlattenMetalParam.init(
+      idim: (id[0], id[1], id[2], id[3]),
+      itrans: (it[0], it[1], it[2], it[3]),
+      odim: (od[0], od[1], od[2], od[3]),
+      otrans: (ot[0], ot[1], ot[2], ot[3])
+    )
+    if computePrecision == .Float32 {
+      super.init(device: device, inFunctionName: "reshape_\(irank)_2_float")
+    } else if computePrecision == .Float16 {
+      super.init(device: device, inFunctionName: "reshape_\(irank)_2_half")
+    } else {
+      fatalError()
+    }
+  }
+  /// Encodes one compute pass: input texture at index 0, output texture at
+  /// index 1 and `metalParam` as inline bytes at buffer index 0.
+  /// Throws `PaddleMobileError.predictError` when no encoder can be created.
+  func compute(commandBuffer: MTLCommandBuffer, param: FlattenParam<P>) throws {
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+      throw PaddleMobileError.predictError(message: " encoder is nil")
+    }
+    encoder.setTexture(param.input.metalTexture, index: 0)
+    encoder.setTexture(param.output.metalTexture, index: 1)
+    // Size the upload by the struct actually being passed, not by an unrelated param type.
+    encoder.setBytes(&metalParam, length: MemoryLayout<FlattenMetalParam>.size, index: 0)
+    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+    encoder.endEncoding()
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/MulticlassNMSKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/MulticlassNMSKernel.swift
@@ -15,11 +15,41 @@
 import Foundation
+// Kernel wrapper for multiclass NMS. It encodes two compute passes: one over the
+// scores texture and one over the bboxes texture, each writing into a result buffer.
 class MulticlassNMSKernel<P: PrecisionType>: Kernel, Computable{
+  // Pipeline for the "nms_fetch_bbox" / "nms_fetch_bbox_half" function, used by the second pass.
+  // The first pass uses `pipline`, which is not declared in this class
+  // (presumably built by Kernel's initializer from inFunctionName; confirm).
+  let pipline1: MTLComputePipelineState
+  // Initializes the middleOutput and bboxOutput buffers, then picks both shader
+  // functions by compute precision. Traps with fatalError on any other precision.
   required init(device: MTLDevice, param: MulticlassNMSParam<P>) {
-    super.init(device: device, inFunctionName: "place_holder")
+    param.middleOutput.initBuffer(device: device)
+    param.bboxOutput.initBuffer(device: device)
+    if computePrecision == .Float32 {
+      pipline1 = device.pipeLine(funcName: "nms_fetch_bbox", inPaddleMobileLib: true)
+      super.init(device: device, inFunctionName: "nms_fetch_result")
+    } else if computePrecision == .Float16 {
+      pipline1 = device.pipeLine(funcName: "nms_fetch_bbox_half", inPaddleMobileLib: true)
+      super.init(device: device, inFunctionName: "nms_fetch_result_half")
+    } else {
+      fatalError( " unsupport precision " )
+    }
   }
+  // Encodes the two passes back to back on the same command buffer.
+  // Throws PaddleMobileError.predictError when either encoder cannot be created.
   func compute(commandBuffer: MTLCommandBuffer, param: MulticlassNMSParam<P>) throws {
+    // Pass 1: scores texture at texture index 0, middleOutput's result buffer at buffer index 0.
     guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+      throw PaddleMobileError.predictError(message: " encode is nil")
+    }
+    encoder.setTexture(param.scores.metalTexture, index: 0)
+    // NOTE(review): resultBuffer is force-unwrapped; presumably initBuffer in init sets it. Confirm.
+    encoder.setBuffer(param.middleOutput.resultBuffer!, offset: 0, index: 0)
+    encoder.dispatch(computePipline: pipline, outTexture: param.scores.metalTexture)
+    encoder.endEncoding()
+    // Pass 2: bboxes texture and bboxOutput's result buffer, dispatched with pipline1.
+    guard let encoderBox = commandBuffer.makeComputeCommandEncoder() else {
+      throw PaddleMobileError.predictError(message: " encode is nil")
+    }
+    encoderBox.setTexture(param.bboxes.metalTexture, index: 0)
+    encoderBox.setBuffer(param.bboxOutput.resultBuffer!, offset: 0, index: 0)
+    encoderBox.dispatch(computePipline: pipline1, outTexture: param.bboxes.metalTexture)
+    encoderBox.endEncoding()
   }
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
@@ -34,24 +34,44 @@ class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{
  required init(device: MTLDevice, param: PriorBoxParam<P>) {
-    param.output.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
+    let originDim = param.output.tensorDim;
+    param.output.tensorDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]])
+    param.output.padToFourDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]])
+    param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: computePrecision)
    param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
    if computePrecision == .Float32 {
+      if param.min_max_aspect_ratios_order {
+        super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder")
+      } else {
        super.init(device: device, inFunctionName: "prior_box")
+      }
    } else if computePrecision == .Float16 {
+      if param.min_max_aspect_ratios_order {
+        super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder_half")
+      } else {
        super.init(device: device, inFunctionName: "prior_box_half")
+      }
    } else {
      fatalError()
    }
-    let n = 1
-    let h = param.output.dim[1]
-    let w = param.output.dim[2]
-    let c = param.output.dim[3] * param.output.dim[0]
-    param.output.dim = Dim.init(inDim: [n, h, w, c])
+    guard param.minSizes.count == 1 else {
-    param.output.transpose = [0, 1, 2, 3]
+      fatalError(" need implement ")
+    }
+//    let n = 1
+//    let h = param.output.dim[1]
+//    let w = param.output.dim[2]
+//    let c = param.output.dim[3] * param.output.dim[0]
+//
+//    param.output.dim = Dim.init(inDim: [n, h, w, c])
+//    param.output.transpose = [0, 1, 2, 3]
    let imageWidth = Float32(param.inputImage.padToFourDim[3])
    let imageHeight = Float32(param.inputImage.padToFourDim[2])

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift
@@ -49,10 +49,12 @@ class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
      odim: (od[0], od[1], od[2], od[3]),
      otrans: (ot[0], ot[1], ot[2], ot[3])
    )
+    let irank = param.input.tensorDim.cout()
+    let orank = param.output.tensorDim.cout()
    if computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "reshape")
+      super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_float")
    } else if computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "reshape_half")
+      super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_half")
    } else {
      fatalError()
    }
@@ -81,15 +83,15 @@ class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
    encoder.endEncoding()
  }
-  func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) {
+//  func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+//    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      fatalError()
+//      fatalError()
-    }
+//    }
-    encoder.setTexture(testParam.inputTexture, index: 0)
+//    encoder.setTexture(testParam.inputTexture, index: 0)
-    encoder.setTexture(testParam.outputTexture, index: 1)
+//    encoder.setTexture(testParam.outputTexture, index: 1)
-    var pm: ReshapeMetalParam = testParam.param
+//    var pm: ReshapeMetalParam = testParam.param
-    encoder.setBytes(&pm, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
+//    encoder.setBytes(&pm, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture)
+//    encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture)
-    encoder.endEncoding()
+//    encoder.endEncoding()
-  }
+//  }
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ShapeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ShapeKernel.swift
@@ -19,19 +19,20 @@ struct ShapeMetalParam {
 class ShapeKernel<P: PrecisionType>: Kernel, Computable{
  func compute(commandBuffer: MTLCommandBuffer, param: ShapeParam<P>) throws {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+//    print("shape compute")
-      throw PaddleMobileError.predictError(message: " encode is nil")
+//    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-    }
+//      throw PaddleMobileError.predictError(message: " encode is nil")
-    encoder.setTexture(param.output.metalTexture, index: 0)
+//    }
-    encoder.endEncoding()
+//    encoder.setTexture(param.output.metalTexture, index: 0)
+//    encoder.endEncoding()
  }
  required init(device: MTLDevice, param: ShapeParam<P>) {
    param.output.initTexture(device: device, computePrecision: computePrecision)
    if computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "split")
+      super.init(device: device, inFunctionName: "shape")
    } else if computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "split_half")
+      super.init(device: device, inFunctionName: "shape_half")
    } else {
      fatalError()
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift
@@ -29,7 +29,7 @@ class SoftmaxKernel<P: PrecisionType>: Kernel, Computable{
      K: Int32(param.input.tensorDim[1])
    )
    if computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "softmax")
+      super.init(device: device, inFunctionName: "softmax_float")
    } else if computePrecision == .Float16 {
      super.init(device: device, inFunctionName: "softmax_half")
    } else {

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SplitKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SplitKernel.swift
@@ -15,23 +15,76 @@
 import Foundation
+// Parameter block uploaded to the split shader by SplitKernel.compute.
 struct SplitMetalParam {
+  // Input dimensions, copied from param.input.dim[0...3] in SplitKernel.init.
+  var idim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1)
+  // Split axis. SplitKernel.init shifts it by the rank difference between dim and
+  // tensorDim, then replaces it with its position inside the input transpose.
+  var axis: Int32 = 0
+  // Not assigned by the SplitKernel code shown in this change; keeps its default of 0 there.
+  var offset: Int32 = 0
+  // Input transpose order, copied from param.input.transpose.
+  var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
+  // Extent of each output along the split axis (at most four outputs); unused slots stay 0.
+  var vdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
 }
 class SplitKernel<P: PrecisionType>: Kernel, Computable{
+  var smp: SplitMetalParam
  func compute(commandBuffer: MTLCommandBuffer, param: SplitParam<P>) throws {
    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
      throw PaddleMobileError.predictError(message: " encode is nil")
    }
-    encoder.setTexture(param.output.metalTexture, index: 0)
+    encoder.setTexture(param.input.metalTexture, index: 0)
+    for i in 0..<param.outputList.count {
+      encoder.setTexture(param.outputList[i].metalTexture, index: i + 1)
+    }
+    encoder.setBytes(&smp, length: MemoryLayout<SplitMetalParam>.size, index: 0)
+    encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture)
    encoder.endEncoding()
  }
  required init(device: MTLDevice, param: SplitParam<P>) {
-    param.output.initTexture(device: device, computePrecision: computePrecision)
+    //     param.output.initTexture(device: device, computePrecision: computePrecision)
+    let num = param.outputList.count
+    let rank = param.input.tensorDim.cout()
+    assert(num >= 2 && num <= 4)
+    for output in param.outputList {
+      output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
+    }
+    smp = SplitMetalParam.init()
+    smp.idim = (Int32(param.input.dim[0]), Int32(param.input.dim[1]), Int32(param.input.dim[2]), Int32(param.input.dim[3]))
+    smp.axis = Int32(param.axis + param.input.dim.cout() - param.input.tensorDim.cout())
+    for i in 0..<4 {
+      if param.input.transpose[i] == smp.axis {
+        smp.axis = Int32(i)
+        break
+      }
+    }
+    smp.trans = (Int32(param.input.transpose[0]), Int32(param.input.transpose[1]), Int32(param.input.transpose[2]), Int32(param.input.transpose[3]))
+    var vdim: [Int32] = [0, 0, 0, 0]
+    for i in 0..<num {
+      vdim[i] = Int32(param.outputList[i].tensorDim[param.axis])
+    }
+    smp.vdim = (vdim[0], vdim[1], vdim[2], vdim[3])
+    var v = "normal"
+    if rank == 4 {
+      if smp.axis == 1 {
+        v = "y"
+      } else if smp.axis == 2 {
+        v = "x"
+      }
+    } else if rank == 3 {
+      if smp.axis == 2 {
+        v = "y"
+      } else if smp.axis == 3 {
+        v = "x"
+      }
+    } else if rank == 2 {
+      if smp.axis == 2 {
+        v = "y"
+      }
+    }
+    if v == "normal" {
+      fatalError("split unsupported")
+    }
    if computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "split")
+      super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_float")
    } else if computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "split_half")
+      super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_half")
    } else {
      fatalError()
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift
@@ -17,73 +17,52 @@ import Foundation
 struct TransposeMetalParam {
  var iC: Int32 = 0
  var oC: Int32 = 0
-  var i0: Int32
+  var axis: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
-  var i1: Int32
-  var i2: Int32
-  var i3: Int32
-  init(_ i0: Int32, _ i1: Int32, _ i2: Int32, _ i3: Int32) {
-    self.i0 = i0
-    self.i1 = i1
-    self.i2 = i2
-    self.i3 = i3
-  }
-  init(_ axis: [Int]) {
-    self.init(Int32(axis[0]), Int32(axis[1]), Int32(axis[2]), Int32(axis[3]))
-  }
-}
-struct TransposeTestParam: TestParam {
-  let inputTexture: MTLTexture
-  let outputTexture: MTLTexture
-  let iC: Int
-  let oC: Int
-  let axis: [Int]
 }
-class TransposeKernel<P: PrecisionType>: Kernel, Computable, Testable {
+class TransposeKernel<P: PrecisionType>: Kernel, Computable {
+  var metalParam: TransposeMetalParam = TransposeMetalParam.init()
  required init(device: MTLDevice, param: TransposeParam<P>) {
-    param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: computePrecision)
+    param.output.initTexture(device: device, computePrecision: computePrecision)
+    let rank = param.input.tensorDim.cout()
-    if computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "transpose_half")
-    } else if computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "transpose")
-    } else {
-      fatalError()
-    }
-    var invT: [Int] = [0, 1, 2, 3]
-    for (i, v) in param.input.transpose.enumerated() {
-      invT[v] = i
-    }
    var axis: [Int] = [0, 1, 2, 3]
    for i in 0..<param.axis.count {
-      axis[4-param.axis.count+i] = 4 - param.axis.count + Int(param.axis[i])
+      axis[4-rank+i] = 4 - rank + Int(param.axis[i])
    }
-    let realAxis = axis.map {invT[$0]}
-    var tmp = TransposeMetalParam.init(realAxis)
+    var naxis: [Int] = [0, 0, 0, 0]
-    tmp.iC = Int32(param.input.dim[param.input.transpose[3]])
+    for i in 0..<4 {
-    tmp.oC = Int32(param.output.dim[3])
+      for j in 0..<4 {
-    if realAxis == [0, 1, 2, 3] {
+        if param.input.transpose[j] == axis[i] {
-//      print("====> transpose! FAST :)")
+          naxis[i] = j
-    } else {
+          break
-//      print("====> transpose! SLOW :(")
        }
-    metalParam = tmp
      }
+    }
-  required init(device: MTLDevice, testParam: TransposeTestParam) {
+    metalParam.iC = Int32(param.input.dim[param.input.transpose[3]])
+    metalParam.oC = Int32(param.output.dim[3])
+    metalParam.axis = (Int32(naxis[0]), Int32(naxis[1]), Int32(naxis[2]), Int32(naxis[3]))
+    var kernelFunc = "transpose_undefined"
    if computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "transpose_half")
+      if param.input.transpose == axis {
+        kernelFunc = "transpose_copy_half"
+      } else {
+        kernelFunc = "transpose_\(rank)_half"
+      }
    } else if computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "transpose")
+      if param.input.transpose == axis {
+        kernelFunc = "transpose_copy_float"
+      } else {
+        kernelFunc = "transpose_\(rank)_float"
+      }
    } else {
      fatalError()
    }
+    print("===========>", kernelFunc)
+    print(metalParam)
+    super.init(device: device, inFunctionName: kernelFunc)
  }
-  var metalParam: TransposeMetalParam!
  func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam<P>) throws {
    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
      throw PaddleMobileError.predictError(message: " encode is nil")
@@ -97,18 +76,4 @@ class TransposeKernel<P: PrecisionType>: Kernel, Computable, Testable {
  }
-  public func test(commandBuffer: MTLCommandBuffer, param: TransposeTestParam) {
+}
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      fatalError()
-    }
-    encoder.setTexture(param.inputTexture, index: 0)
-    encoder.setTexture(param.outputTexture, index: 1)
-    var tmp = TransposeMetalParam.init(param.axis)
-    tmp.iC = Int32(param.iC)
-    tmp.oC = Int32(param.oC)
-    encoder.setBytes(&tmp, length: MemoryLayout<TransposeMetalParam>.size, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
-    encoder.endEncoding()
-  }}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormKernel.metal
@@ -15,28 +15,28 @@
 #include <metal_stdlib>
 using namespace metal;
-kernel void batchnorm_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+kernel void batchnorm(texture2d_array<float, access::read> inTexture [[texture(0)]],
-                      texture2d_array<half, access::write> outTexture [[texture(1)]],
+                      texture2d_array<float, access::write> outTexture [[texture(1)]],
-                      const device half4 * newScale [[buffer(0)]],
+                      const device float4 * nscale [[buffer(0)]],
-                      const device half4 * newBias [[buffer(1)]],
+                      const device float4 * nbias [[buffer(1)]],
                      uint3 gid [[thread_position_in_grid]]) {
  if (gid.x >= outTexture.get_width() ||
      gid.y >= outTexture.get_height() ||
      gid.z >= outTexture.get_array_size()) return;
-  const half4 input = inTexture.read(gid.xy, gid.z);
+  const float4 input = inTexture.read(gid.xy, gid.z);
-  half4 output = input * newScale[gid.z] + newBias[gid.z];
+  float4 output = input * nscale[gid.z] + nbias[gid.z];
  outTexture.write(output, gid.xy, gid.z);
 }
-kernel void batchnorm(texture2d_array<float, access::read> inTexture [[texture(0)]],
+kernel void batchnorm_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
-                           texture2d_array<float, access::write> outTexture [[texture(1)]],
+                      texture2d_array<half, access::write> outTexture [[texture(1)]],
-                           const device float4 * newScale [[buffer(0)]],
+                      const device half4 * newScale [[buffer(0)]],
-                           const device float4 * newBias [[buffer(1)]],
+                      const device half4 * newBias [[buffer(1)]],
                      uint3 gid [[thread_position_in_grid]]) {
  if (gid.x >= outTexture.get_width() ||
      gid.y >= outTexture.get_height() ||
      gid.z >= outTexture.get_array_size()) return;
-  const float4 input = inTexture.read(gid.xy, gid.z);
+  const half4 input = inTexture.read(gid.xy, gid.z);
-  float4 output = input * newScale[gid.z] + newBias[gid.z];
+  half4 output = input * newScale[gid.z] + newBias[gid.z];
  outTexture.write(output, gid.xy, gid.z);
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.inc.metal
+#ifdef P
+#define CONCAT2(a, b) a ## b
+#define CONCAT2_(a, b) a ## _ ## b
+#define FUNC(f, p) CONCAT2_(f, p)
+#define VECTOR(p, n) CONCAT2(p, n)
+// Bilinear resize of a texture array; instantiated once per precision P as
+// bilinear_interp_<P>. When input and output have the same width and height the
+// texel is copied unchanged. Otherwise the output position is scaled by
+// pm.ratio_w / pm.ratio_h into input space and the four neighbouring texels are
+// blended.
+kernel void FUNC(bilinear_interp, P)(texture2d_array<P, access::read> input [[texture(0)]],
+                     texture2d_array<P, access::write> output [[texture(1)]],
+                     constant bilinear_interp_param & pm [[buffer(0)]],
+                     uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extents have no texel to produce
+  // (same guard as the batchnorm kernels).
+  if (gid.x >= output.get_width() ||
+      gid.y >= output.get_height() ||
+      gid.z >= output.get_array_size()) return;
+  VECTOR(P, 4) r;
+  if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
+    r = input.read(gid.xy, gid.z);
+  } else {
+    // Source coordinates stay in float even when P is half: a half cannot
+    // represent every texel index of a large texture, and its coarse fraction
+    // would quantise the interpolation weights.
+    const float w = gid.x * pm.ratio_w;
+    const float h = gid.y * pm.ratio_h;
+    const uint w0 = w, h0 = h;
+    uint w1 = w0 + 1, h1 = h0 + 1;
+    // Only the weights, which lie in [0, 1), are narrowed to P for the blend.
+    const P w1lambda = P(w - w0), h1lambda = P(h - h0);
+    const P w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
+    // At the right/bottom border reuse the last column/row instead of reading
+    // past the input.
+    if (w1 >= input.get_width()) w1 = w0;
+    if (h1 >= input.get_height()) h1 = h0;
+    VECTOR(P, 4) r0 = input.read(uint2(w0, h0), gid.z);
+    VECTOR(P, 4) r1 = input.read(uint2(w1, h0), gid.z);
+    VECTOR(P, 4) r2 = input.read(uint2(w0, h1), gid.z);
+    VECTOR(P, 4) r3 = input.read(uint2(w1, h1), gid.z);
+    r = h2lambda * (w2lambda * r0 + w1lambda * r1)
+      + h1lambda * (w2lambda * r2 + w1lambda * r3);
+  }
+  output.write(r, gid.xy, gid.z);
+}
+#endif
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.metal
@@ -16,60 +16,14 @@
 using namespace metal;
 struct bilinear_interp_param {
-//  int32_t out_h;
-//  int32_t out_w;
  float ratio_h;
  float ratio_w;
 };
-kernel void bilinear_interp(texture2d_array<float, access::read> input [[texture(0)]],
+#define P float
-                     texture2d_array<float, access::write> output [[texture(2)]],
+#include "BilinearInterp.inc.metal"
-                     constant bilinear_interp_param & pm [[buffer(0)]],
+#undef P
-                     uint3 gid [[thread_position_in_grid]]) {
-  float4 r;
-  if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
-    r = input.read(gid.xy, gid.z);
-  } else {
-    float w = gid.x * pm.ratio_w;
-    float h = gid.y * pm.ratio_h;
-    uint w0 = w, h0 = h;
-    uint w1 = w0 + 1, h1 = h0 + 1;
-    float w1lambda = w - w0, h1lambda = h - h0;
-    float w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
-    if (w1 >= input.get_width()) w1 = w0;
-    if (h1 >= input.get_height()) h1 = h0;
-    float4 r0 = input.read(uint2(w0, h0), gid.z);
-    float4 r1 = input.read(uint2(w1, h0), gid.z);
-    float4 r2 = input.read(uint2(w0, h1), gid.z);
-    float4 r3 = input.read(uint2(w1, h1), gid.z);
-    r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);
-  }
-  output.write(r, gid.xy, gid.z);
-}
-kernel void bilinear_interp_half(texture2d_array<half, access::read> input [[texture(0)]],
+#define P half
-                            texture2d_array<half, access::write> output [[texture(2)]],
+#include "BilinearInterp.inc.metal"
-                            constant bilinear_interp_param & pm [[buffer(0)]],
+#undef P
-                            uint3 gid [[thread_position_in_grid]]) {
-  half4 r;
-  if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
-    r = input.read(gid.xy, gid.z);
-  } else {
-    half w = gid.x * pm.ratio_w;
-    half h = gid.y * pm.ratio_h;
-    uint w0 = w, h0 = h;
-    uint w1 = w0 + 1, h1 = h0 + 1;
-    half w1lambda = w - w0, h1lambda = h - h0;
-    half w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
-    if (w1 >= input.get_width()) w1 = w0;
-    if (h1 >= input.get_height()) h1 = h0;
-    half4 r0 = input.read(uint2(w0, h0), gid.z);
-    half4 r1 = input.read(uint2(w1, h0), gid.z);
-    half4 r2 = input.read(uint2(w0, h1), gid.z);
-    half4 r3 = input.read(uint2(w1, h1), gid.z);
-    r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);
-  }
-  output.write(r, gid.xy, gid.z);
-  output.write(r, gid.xy, gid.z);
-}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.inc.metal
+#ifdef P
+#define CONCAT2(a, b) a ## b
+#define CONCAT2_(a, b) a ## _ ## b
+#define FUNC(f, p) CONCAT2_(f, p)
+#define VECTOR(p, n) CONCAT2(p, n)
+// Box decoder; instantiated once per precision P as boxcoder_<P>.
+// The prior box and its variances are read from texel (0, gid.x). The four
+// target offsets are gathered from the first component of texels (0..3, gid.x).
+// The decoded box is written as two corners to output at gid.xy.
+kernel void FUNC(boxcoder, P)(texture2d_array<P, access::read> priorBox [[texture(0)]],
+                     texture2d_array<P, access::read> priorBoxVar [[texture(1)]],
+                     texture2d_array<P, access::read> targetBox [[texture(2)]],
+                     texture2d_array<P, access::write> output[[texture(3)]],
+                     uint3 gid [[thread_position_in_grid]]) {
+  const uint row = gid.x;
+  const VECTOR(P, 4) prior = priorBox.read(uint2(0, row), gid.z);
+  const VECTOR(P, 4) variance = priorBoxVar.read(uint2(0, row), gid.z);
+  // One offset per texel column; only component 0 of each texel is used.
+  VECTOR(P, 4) delta;
+  for (uint k = 0; k < 4; ++k) {
+    delta[k] = targetBox.read(uint2(k, row), gid.z)[0];
+  }
+  // Prior box expressed as centre and size.
+  const P priorCx = (prior.x + prior.z) / 2;
+  const P priorCy = (prior.y + prior.w) / 2;
+  const P priorW = prior.z - prior.x;
+  const P priorH = prior.w - prior.y;
+  // Decoded centre and size.
+  const P cx = variance.x * delta.x * priorW + priorCx;
+  const P cy = variance.y * delta.y * priorH + priorCy;
+  const P boxW = exp(variance.z * delta.z) * priorW;
+  const P boxH = exp(variance.w * delta.w) * priorH;
+  // Back to corner form.
+  VECTOR(P, 4) corners;
+  corners.x = cx - boxW / 2;
+  corners.y = cy - boxH / 2;
+  corners.z = cx + boxW / 2;
+  corners.w = cy + boxH / 2;
+  output.write(corners, gid.xy, gid.z);
+}
+#endif
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal
@@ -15,58 +15,9 @@
 #include <metal_stdlib>
 using namespace metal;
-kernel void boxcoder(texture2d_array<float, access::read> priorBox [[texture(0)]],
+#define P float
-                     texture2d_array<float, access::read> priorBoxVar [[texture(1)]],
+#include "BoxCoder.inc.metal"
-                     texture2d_array<float, access::read> targetBox [[texture(2)]],
+#undef P
-                     texture2d_array<float, access::write> output[[texture(3)]],
+#define P half
-                     uint3 gid [[thread_position_in_grid]]) {
+#include "BoxCoder.inc.metal"
-  float4 t = targetBox.read(gid.xy, gid.z);
+#undef P
-  float4 p = priorBox.read(gid.xy, gid.z);
-  float4 pv = priorBoxVar.read(gid.xy, gid.z);
-  float px = (p.x + p.z) / 2;
-  float py = (p.y + p.w) / 2;
-  float pw = p.z - p.x;
-  float ph = p.w - p.y;
-  float tx = pv.x * t.x * pw + px;
-  float ty = pv.y * t.y * ph + py;
-  float tw = exp(pv.z * t.z) * pw;
-  float th = exp(pv.w * t.w) * ph;
-  float4 r;
-  r.x = tx - tw / 2;
-  r.y = ty - th / 2;
-  r.z = tx + tw / 2;
-  r.w = ty + th / 2;
-  output.write(r, gid.xy, gid.z);
-}
-kernel void boxcoder_half(texture2d_array<half, access::read> priorBox [[texture(0)]],
-                     texture2d_array<half, access::read> priorBoxVar [[texture(1)]],
-                     texture2d_array<half, access::read> targetBox [[texture(2)]],
-                     texture2d_array<half, access::write> output[[texture(3)]],
-                     uint3 gid [[thread_position_in_grid]]) {
-  half4 t = targetBox.read(gid.xy, gid.z);
-  half4 p = priorBox.read(gid.xy, gid.z);
-  half4 pv = priorBoxVar.read(gid.xy, gid.z);
-  float px = (float(p.x) + float(p.z)) / 2;
-  float py = (float(p.y) + float(p.w)) / 2;
-  float pw = float(p.z) - float(p.x);
-  float ph = float(p.w) - float(p.y);
-  float tx = float(pv.x) * float(t.x) * pw + px;
-  float ty = float(pv.y) * float(t.y) * ph + py;
-  float tw = exp(float(pv.z) * float(t.z)) * pw;
-  float th = exp(float(pv.w) * float(t.w)) * ph;
-  float4 r;
-  r.x = tx - tw / 2;
-  r.y = ty - th / 2;
-  r.z = tx + tw / 2;
-  r.w = ty + th / 2;
-  output.write(half4(r), gid.xy, gid.z);
-}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal
@@ -15,6 +15,55 @@
 #include <metal_stdlib>
 using namespace metal;
+// Rank-1 mapping from texture position to tensor index: the three leading
+// indices are zero and the last one is 4 * xyzn[0] + xyzn[3].
+inline void xyzn2abcd_1(int xyzn[4], int abcd[4]) {
+  for (int k = 0; k < 3; ++k) {
+    abcd[k] = 0;
+  }
+  abcd[3] = 4 * xyzn[0] + xyzn[3];
+}
+// Rank-2 mapping from texture position to tensor index: the two leading
+// indices are zero, abcd[2] takes xyzn[1], and abcd[3] is 4 * xyzn[0] + xyzn[3].
+inline void xyzn2abcd_2(int xyzn[4], int abcd[4]) {
+  abcd[1] = 0;
+  abcd[0] = 0;
+  abcd[2] = xyzn[1];
+  abcd[3] = 4 * xyzn[0] + xyzn[3];
+}
+// Rank-3 mapping from texture position to tensor index: abcd[0] is zero,
+// abcd[3] and abcd[2] take xyzn[0] and xyzn[1], and abcd[1] is
+// 4 * xyzn[2] + xyzn[3].
+inline void xyzn2abcd_3(int xyzn[4], int abcd[4]) {
+  abcd[0] = 0;
+  abcd[3] = xyzn[0];
+  abcd[2] = xyzn[1];
+  const int packed = 4 * xyzn[2] + xyzn[3];
+  abcd[1] = packed;
+}
+// Rank-4 mapping from texture position to tensor index. abcd[2] and abcd[1]
+// take xyzn[0] and xyzn[1]; the flat value 4 * xyzn[2] + xyzn[3] is split by C
+// into quotient (abcd[0]) and remainder (abcd[3]).
+inline void xyzn2abcd_4(int C, int xyzn[4], int abcd[4]) {
+  abcd[2] = xyzn[0];
+  abcd[1] = xyzn[1];
+  const uint flat = 4 * xyzn[2] + xyzn[3];
+  abcd[0] = flat / C;
+  abcd[3] = flat % C;
+}
+// Rank-1 mapping from tensor index to texture position; inverse of
+// xyzn2abcd_1, which builds abcd[3] as xyzn[0] * 4 + xyzn[3].
+// xyzn[1] and xyzn[2] are zero, xyzn[0] is abcd[3] / 4 and xyzn[3] is the
+// remainder abcd[3] % 4.
+inline void abcd2xyzn_1(int abcd[4], int xyzn[4]) {
+  xyzn[1] = xyzn[2] = 0;
+  xyzn[0] = abcd[3] / 4;
+  // The remainder belongs in xyzn[3]; storing it in xyzn[1] would overwrite the
+  // zero above and leave xyzn[3] unset.
+  xyzn[3] = abcd[3] % 4;
+}
+// Rank-2 mapping from tensor index to texture position: xyzn[2] is zero,
+// xyzn[1] takes abcd[2], and abcd[3] is split into quotient (xyzn[0]) and
+// remainder (xyzn[3]) by 4.
+inline void abcd2xyzn_2(int abcd[4], int xyzn[4]) {
+  const int flat = abcd[3];
+  xyzn[2] = 0;
+  xyzn[1] = abcd[2];
+  xyzn[0] = flat / 4;
+  xyzn[3] = flat % 4;
+}
+// Rank-3 mapping from tensor index to texture position: xyzn[0] and xyzn[1]
+// take abcd[3] and abcd[2], and abcd[1] is split into quotient (xyzn[2]) and
+// remainder (xyzn[3]) by 4.
+inline void abcd2xyzn_3(int abcd[4], int xyzn[4]) {
+  xyzn[0] = abcd[3];
+  xyzn[1] = abcd[2];
+  const int packed = abcd[1];
+  xyzn[2] = packed / 4;
+  xyzn[3] = packed % 4;
+}
+// Rank-4 mapping from tensor index to texture position. xyzn[0] and xyzn[1]
+// take abcd[2] and abcd[1]; the flat value abcd[0] * C + abcd[3] is split into
+// quotient (xyzn[2]) and remainder (xyzn[3]) by 4.
+inline void abcd2xyzn_4(int C, int abcd[4], int xyzn[4]) {
+  xyzn[0] = abcd[2];
+  xyzn[1] = abcd[1];
+  const uint flat = abcd[0] * C + abcd[3];
+  xyzn[2] = flat / 4;
+  xyzn[3] = flat % 4;
+}
 inline void xyzn2abcd(int C, int xyzn[4], int abcd[4]) {
  abcd[2] = xyzn[0];
  abcd[1] = xyzn[1];

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Concat.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Concat.metal
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License. */
-#include <metal_stdlib>
-#include "Common.metal"
-using namespace metal;
-struct ConcatParam {
-  int32_t odim[4];
-  int32_t axis;
-  int32_t offset;
-  int32_t trans[4];
-  int32_t vdim[6];
-};
-kernel void concat(texture2d_array<float, access::read> in0 [[texture(0)]],
-                   texture2d_array<float, access::read> in1 [[texture(1)]],
-                   texture2d_array<float, access::read> in2 [[texture(2)]],
-                   texture2d_array<float, access::read> in3 [[texture(3)]],
-                   texture2d_array<float, access::read> in4 [[texture(4)]],
-                   texture2d_array<float, access::read> in5 [[texture(5)]],
-                   texture2d_array<float, access::read> inx [[texture(6)]],
-                   texture2d_array<float, access::write> out [[texture(7)]],
-                   constant ConcatParam & pm [[buffer(0)]],
-                   uint3 gid [[thread_position_in_grid]]) {
-  ConcatParam cp = pm;
-  int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4];
-  float4 r;
-  for (int i = 0; i < 4; i++) {
-    xyzn[3] = i;
-    xyzn2abcd(cp.odim[3], xyzn, abcd);
-    int k = abcd[cp.axis] - cp.offset;
-    int j = 0;
-    if (k < 0) {
-      r[i] = inx.read(gid.xy, gid.z)[i];
-    } else {
-      for (; j < 6; j++) {
-        if (k < cp.vdim[j]) {
-          break;
-        }
-        k -= cp.vdim[j];
-      }
-      int ta = cp.odim[cp.axis];
-      abcd[cp.axis] = k;
-      cp.odim[cp.axis] = cp.vdim[j];
-      abcd2xyzn(cp.odim[3], abcd, oxyzn);
-      cp.odim[cp.axis] = ta;
-      switch (j) {
-        case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
-        case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
-        case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
-        case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
-        case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
-        case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
-      }
-    }
-  }
-  out.write(r, gid.xy, gid.z);
-}
-kernel void concat_half(texture2d_array<half, access::read> in0 [[texture(0)]],
-                   texture2d_array<half, access::read> in1 [[texture(1)]],
-                   texture2d_array<half, access::read> in2 [[texture(2)]],
-                   texture2d_array<half, access::read> in3 [[texture(3)]],
-                   texture2d_array<half, access::read> in4 [[texture(4)]],
-                   texture2d_array<half, access::read> in5 [[texture(5)]],
-                   texture2d_array<half, access::read> inx [[texture(6)]],
-                   texture2d_array<half, access::write> out [[texture(7)]],
-                   constant ConcatParam & pm [[buffer(0)]],
-                   uint3 gid [[thread_position_in_grid]]) {
-  ConcatParam cp = pm;
-  int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4];
-  half4 r;
-  for (int i = 0; i < 4; i++) {
-    xyzn[3] = i;
-    xyzn2abcd(cp.odim[3], xyzn, abcd);
-    int k = abcd[cp.axis] - cp.offset;
-    int j = 0;
-    if (k < 0) {
-      r[i] = inx.read(gid.xy, gid.z)[i];
-    } else {
-      for (; j < 6; j++) {
-        if (k < cp.vdim[j]) {
-          break;
-        }
-        k -= cp.vdim[j];
-      }
-      int ta = cp.odim[cp.axis];
-      abcd[cp.axis] = k;
-      cp.odim[cp.axis] = cp.vdim[j];
-      abcd2xyzn(cp.odim[3], abcd, oxyzn);
-      cp.odim[cp.axis] = ta;
-      switch (j) {
-        case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
-        case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
-        case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
-        case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
-        case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
-        case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
-      }
-    }
-  }
-  out.write(r, gid.xy, gid.z);
-}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.inc.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal
@@ -17,13 +17,14 @@
 using namespace metal;
-kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+kernel void conv_add_batch_norm_relu_1x1_half(
+            texture2d_array<half, access::sample> inTexture [[texture(0)]],
            texture2d_array<half, access::write> outTexture [[texture(1)]],
            constant MetalConvParam &param [[buffer(0)]],
            const device half4 *weights [[buffer(1)]],
            const device half4 *biase [[buffer(2)]],
-                                              const device float4 *new_scale [[buffer(3)]],
+            const device half4 *new_scale [[buffer(3)]],
-                                              const device float4 *new_biase [[buffer(4)]],
+            const device half4 *new_biase [[buffer(4)]],
            uint3 gid [[thread_position_in_grid]]) {
  if (gid.x >= outTexture.get_width() ||
@@ -41,7 +42,7 @@ kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::samp
  uint input_arr_size = inTexture.get_array_size();
  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  half4 output = half4(0.0);
+  float4 output = float4(0.0);
  half4 input;
  for (uint i = 0; i < input_arr_size; ++i) {
@@ -58,18 +59,18 @@ kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::samp
    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
    output.w += dot(input, weight_w);
  }
+  output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
-  output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
+  outTexture.write(half4(output), gid.xy, gid.z);
-  outTexture.write(output, gid.xy, gid.z);
 }
-kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+kernel void conv_add_batch_norm_relu_3x3_half(
+            texture2d_array<half, access::sample> inTexture [[texture(0)]],
            texture2d_array<half, access::write> outTexture [[texture(1)]],
            constant MetalConvParam &param [[buffer(0)]],
            const device half4 *weights [[buffer(1)]],
            const device half4 *biase [[buffer(2)]],
-                                              const device float4 *new_scale [[buffer(3)]],
+            const device half4 *new_scale [[buffer(3)]],
-                                              const device float4 *new_biase [[buffer(4)]],
+            const device half4 *new_biase [[buffer(4)]],
            uint3 gid [[thread_position_in_grid]]) {
  if (gid.x >= outTexture.get_width() ||
@@ -86,7 +87,7 @@ kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::samp
  uint input_arr_size = inTexture.get_array_size();
  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-  half4 output = half4(0.0);
+  float4 output = float4(0.0);
  half4 input[9];
  for (uint i = 0; i < input_arr_size; ++i) {
@@ -113,18 +114,18 @@ kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::samp
      output.w += dot(input[j], weight_w);
    }
  }
-  output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
+  output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
-  outTexture.write(output, gid.xy, gid.z);
+  outTexture.write(half4(output), gid.xy, gid.z);
 }
+kernel void depthwise_conv_add_batch_norm_relu_3x3_half(
-kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+            texture2d_array<half, access::sample> inTexture [[texture(0)]],
            texture2d_array<half, access::write> outTexture [[texture(1)]],
            constant MetalConvParam &param [[buffer(0)]],
            const device half *weights [[buffer(1)]],
            const device half4 *biase [[buffer(2)]],
-                                                        const device float4 *new_scale [[buffer(3)]],
+            const device half4 *new_scale [[buffer(3)]],
-                                                        const device float4 *new_biase [[buffer(4)]],
+            const device half4 *new_biase [[buffer(4)]],
            uint3 gid [[thread_position_in_grid]]) {
  if (gid.x >= outTexture.get_width() ||
@@ -138,7 +139,7 @@ kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, ac
  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
  const uint kernelHXW = 9;
  uint weithTo = gid.z * kernelHXW * 4;
-  half4 output = half4(0.0);
+  float4 output = float4(0.0);
  half4 inputs[9];
  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
@@ -156,11 +157,12 @@ kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, ac
    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
  }
-  output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
+  output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
-  outTexture.write(output, gid.xy, gid.z);
+  outTexture.write(half4(output), gid.xy, gid.z);
 }
 /*---------------------------------------------*/

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPrelu.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPrelu.inc.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPreluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPreluKernel.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/FetchKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/FetchKernel.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Macro.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Macro.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+using namespace metal;
+// Token-pasting helpers used to build identifiers at preprocessing time.
+// CONCAT2 joins two tokens with nothing between them (a, b -> ab).
+#define CONCAT2(a, b) a ## b
+// The CONCATn_ variants join n tokens with a single underscore between each pair (a, b -> a_b).
+#define CONCAT2_(a, b) a ## _ ## b
+#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
+#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d
+#define CONCAT5_(a, b, c, d, e) a ## _ ## b ## _ ## c ## _ ## d ## _ ## e
+// The wrappers below only forward to the CONCAT helpers. Their arguments are not direct
+// operands of ##, so an argument that is itself a macro is expanded before it is pasted.
+// FUNC builds a five-part, underscore-separated identifier.
+// NOTE(review): the meaning of f, r, n, v and p is not visible here -- confirm against the call sites.
+#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p)
+// VECTOR pastes p and n directly; presumably a scalar type name and a width (e.g. half, 4 -> half4) -- TODO confirm.
+#define VECTOR(p, n) CONCAT2(p, n)
+// FUNC3_ builds a three-part, underscore-separated identifier.
+#define FUNC3_(a, b, c) CONCAT3_(a, b, c)
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/NMSFetchResultKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/NMSFetchResultKernel.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.inc.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Shape.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Shape.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+using namespace metal;
+// Kernel with no parameters and an empty body; it reads and writes nothing.
+// NOTE(review): presumably a placeholder so that a function named "shape" exists in the
+// compiled library -- confirm against the host-side code that looks it up.
+kernel void shape() {
+}
+// Parameterless, empty-bodied kernel under the name shape_half; it also reads and writes nothing.
+kernel void shape_half() {
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.inc.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.inc.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.inc.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/MulticlassNMSOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/MulticlassNMSOp.swift
--- a/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
--- a/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift
--- a/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift
--- a/metal/paddle-mobile/paddle-mobile/Operators/ShapeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ShapeOp.swift
--- a/metal/paddle-mobile/paddle-mobile/Operators/SplitOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/SplitOp.swift
--- a/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift
--- a/metal/paddle-mobile/paddle-mobile/PaddleMobile.swift
+++ b/metal/paddle-mobile/paddle-mobile/PaddleMobile.swift
--- a/metal/paddle-mobile/paddle-mobile/PaddleMobileGPU.m
+++ b/metal/paddle-mobile/paddle-mobile/PaddleMobileGPU.m
--- a/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift
--- a/metal/paddle-mobile/paddle-mobile/Program/OpDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/OpDesc.swift
--- a/metal/paddle-mobile/paddle-mobile/Program/Program.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/Program.swift
--- a/metal/paddle-mobile/paddle-mobile/Program/ProgramDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/ProgramDesc.swift
--- a/metal/paddle-mobile/paddle-mobile/Program/ProgramOptimize.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/ProgramOptimize.swift
--- a/metal/paddle-mobile/paddle-mobile/Program/TensorDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/TensorDesc.swift
--- a/metal/paddle-mobile/paddle-mobile/Program/VarDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/VarDesc.swift
--- a/metal/paddle-mobile/paddle-mobile/framework/Executor.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Executor.swift
--- a/metal/paddle-mobile/paddle-mobile/framework/Loader.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Loader.swift