Merge pull request #983 from codeWorm2015/metal

Metal

Merge pull request #983 from codeWorm2015/metal
Metal
e6fed8db · Ruilong Liu · GitHub · 16c3a457 · e4c79d2f · e6fed8db
35 changed files
--- a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj
@@ -14,6 +14,8 @@
 		FC039B8720E11C550081E9F8 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC039B8520E11C550081E9F8 /* Main.storyboard */; };
 		FC039B8920E11C560081E9F8 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = FC039B8820E11C560081E9F8 /* Assets.xcassets */; };
 		FC039B8C20E11C560081E9F8 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC039B8A20E11C560081E9F8 /* LaunchScreen.storyboard */; };
+		FC803BCD214D27930094B8E5 /* FPSCounter.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BCB214D27920094B8E5 /* FPSCounter.swift */; };
+		FC803BCE214D27930094B8E5 /* VideoCapture.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BCC214D27920094B8E5 /* VideoCapture.swift */; };
 		FC8CFEE62135452C0094D569 /* genet_params in Resources */ = {isa = PBXBuildFile; fileRef = FC8CFEE42135452B0094D569 /* genet_params */; };
 		FC8CFEE72135452C0094D569 /* genet_model in Resources */ = {isa = PBXBuildFile; fileRef = FC8CFEE52135452B0094D569 /* genet_model */; };
 		FC8CFEF8213551D10094D569 /* params in Resources */ = {isa = PBXBuildFile; fileRef = FC8CFEF6213551D00094D569 /* params */; };
@@ -61,6 +63,8 @@
 		FC039B8D20E11C560081E9F8 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
 		FC27991121343A39000B6BAD /* paddle-mobile-demo-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "paddle-mobile-demo-Bridging-Header.h"; sourceTree = "<group>"; };
 		FC4FD97B2140EE250073E130 /* libc++.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = "libc++.tbd"; path = "usr/lib/libc++.tbd"; sourceTree = SDKROOT; };
+		FC803BCB214D27920094B8E5 /* FPSCounter.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FPSCounter.swift; sourceTree = "<group>"; };
+		FC803BCC214D27920094B8E5 /* VideoCapture.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = VideoCapture.swift; sourceTree = "<group>"; };
 		FC8CFEE42135452B0094D569 /* genet_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = genet_params; sourceTree = "<group>"; };
 		FC8CFEE52135452B0094D569 /* genet_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = genet_model; sourceTree = "<group>"; };
 		FC8CFEF6213551D00094D569 /* params */ = {isa = PBXFileReference; lastKnownFileType = file; path = params; sourceTree = "<group>"; };
@@ -132,6 +136,7 @@
 		FC039B8020E11C550081E9F8 /* paddle-mobile-demo */ = {
 			isa = PBXGroup;
 			children = (
+				FC803BCA214D27920094B8E5 /* VideoCapture */,
 				FC8CFED2213519540094D569 /* Net */,
 				FC0E2C2020EDC03B009C1FAC /* models */,
 				FC0E2C1D20EDC030009C1FAC /* images */,
@@ -172,6 +177,15 @@
 			path = ../../models;
 			sourceTree = "<group>";
 		};
+		FC803BCA214D27920094B8E5 /* VideoCapture */ = {
+			isa = PBXGroup;
+			children = (
+				FC803BCB214D27920094B8E5 /* FPSCounter.swift */,
+				FC803BCC214D27920094B8E5 /* VideoCapture.swift */,
+			);
+			path = VideoCapture;
+			sourceTree = "<group>";
+		};
 		FC8CFED2213519540094D569 /* Net */ = {
 			isa = PBXGroup;
 			children = (
@@ -345,9 +359,11 @@
 			buildActionMask = 2147483647;
 			files = (
 				FC039B8420E11C550081E9F8 /* ViewController.swift in Sources */,
+				FC803BCE214D27930094B8E5 /* VideoCapture.swift in Sources */,
 				FC013928210204A3008100E3 /* PreProcessKernel.metal in Sources */,
 				FCF437E8214B6DDB00943429 /* Multi-Predict-ViewController.swift in Sources */,
 				FCBCCC552122EF5500D94F7E /* MetalHelper.swift in Sources */,
+				FC803BCD214D27930094B8E5 /* FPSCounter.swift in Sources */,
 				FC039B8220E11C550081E9F8 /* AppDelegate.swift in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
@@ -499,7 +515,7 @@
 				DEVELOPMENT_TEAM = A798K58VVL;
 				ENABLE_BITCODE = NO;
 				INFOPLIST_FILE = "paddle-mobile-demo/Info.plist";
-				IPHONEOS_DEPLOYMENT_TARGET = 9.0;
+				IPHONEOS_DEPLOYMENT_TARGET = 10.0;
 				LD_RUNPATH_SEARCH_PATHS = (
 					"$(inherited)",
 					"@executable_path/Frameworks",
@@ -526,7 +542,7 @@
 				DEVELOPMENT_TEAM = A798K58VVL;
 				ENABLE_BITCODE = NO;
 				INFOPLIST_FILE = "paddle-mobile-demo/Info.plist";
-				IPHONEOS_DEPLOYMENT_TARGET = 9.0;
+				IPHONEOS_DEPLOYMENT_TARGET = 10.0;
 				LD_RUNPATH_SEARCH_PATHS = (
 					"$(inherited)",
 					"@executable_path/Frameworks",

--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
@@ -37,7 +37,7 @@
                </viewController>
                <placeholder placeholderIdentifier="IBFirstResponder" id="68E-SG-96s" userLabel="First Responder" sceneMemberID="firstResponder"/>
            </objects>
-            <point key="canvasLocation" x="-1438" y="331"/>
+            <point key="canvasLocation" x="-559" y="686"/>
        </scene>
        <!--View Controller-->
        <scene sceneID="tne-QT-ifu">
@@ -48,7 +48,7 @@
                        <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
                        <subviews>
                            <imageView userInteractionEnabled="NO" contentMode="scaleAspectFit" horizontalHuggingPriority="251" verticalHuggingPriority="251" translatesAutoresizingMaskIntoConstraints="NO" id="ZZh-fw-LwK">
-                                <rect key="frame" x="0.0" y="20" width="375" height="247"/>
+                                <rect key="frame" x="0.0" y="20" width="225" height="247"/>
                            </imageView>
                            <label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" text="Thread:" textAlignment="natural" lineBreakMode="tailTruncation" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="2EB-m2-a3L">
                                <rect key="frame" x="10" y="538" width="68" height="24"/>
@@ -170,9 +170,14 @@
                                <fontDescription key="fontDescription" type="system" pointSize="15"/>
                                <textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
                            </textView>
+                            <view contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="Cil-py-NiA">
+                                <rect key="frame" x="225" y="20" width="150" height="247"/>
+                                <color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
+                            </view>
                        </subviews>
                        <color key="backgroundColor" red="1" green="1" blue="1" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
                        <constraints>
+                            <constraint firstItem="m5L-O7-P31" firstAttribute="top" secondItem="Cil-py-NiA" secondAttribute="bottom" constant="10" id="16p-IK-b5X"/>
                            <constraint firstItem="6Tk-OE-BBY" firstAttribute="trailing" secondItem="VQn-bS-fWp" secondAttribute="trailing" constant="10" id="1Xg-0h-9SE"/>
                            <constraint firstItem="avL-VK-Kha" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" constant="10" id="2t9-hS-VXa"/>
                            <constraint firstItem="R90-Yf-S6g" firstAttribute="centerY" secondItem="wUL-9N-u1V" secondAttribute="centerY" id="76b-Ny-1Og"/>
@@ -187,11 +192,12 @@
                            <constraint firstItem="XpL-9M-UOp" firstAttribute="centerY" secondItem="wUL-9N-u1V" secondAttribute="centerY" id="KWW-qT-Rzf"/>
                            <constraint firstItem="6MG-gv-hD5" firstAttribute="centerY" secondItem="avL-VK-Kha" secondAttribute="centerY" id="KZa-YZ-DEs"/>
                            <constraint firstItem="2EB-m2-a3L" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" constant="10" id="Le3-TN-zOL"/>
-                            <constraint firstItem="ZZh-fw-LwK" firstAttribute="trailing" secondItem="6Tk-OE-BBY" secondAttribute="trailing" id="MeS-HQ-voE"/>
+                            <constraint firstItem="ZZh-fw-LwK" firstAttribute="trailing" secondItem="6Tk-OE-BBY" secondAttribute="trailing" constant="-150" id="MeS-HQ-voE"/>
                            <constraint firstItem="m5L-O7-P31" firstAttribute="top" secondItem="ZZh-fw-LwK" secondAttribute="bottom" constant="10" id="NUL-Ta-VI8"/>
                            <constraint firstItem="m5L-O7-P31" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" constant="15" id="RFA-z1-9aB"/>
                            <constraint firstItem="wUL-9N-u1V" firstAttribute="width" secondItem="a3K-ri-NVs" secondAttribute="width" id="Rp6-Bh-BN3"/>
                            <constraint firstItem="6MG-gv-hD5" firstAttribute="trailing" secondItem="6Tk-OE-BBY" secondAttribute="trailing" id="S0W-0G-75m"/>
+                            <constraint firstItem="Cil-py-NiA" firstAttribute="top" secondItem="6Tk-OE-BBY" secondAttribute="top" id="UNc-Et-9Yv"/>
                            <constraint firstItem="w7H-Sk-Rai" firstAttribute="leading" secondItem="wUL-9N-u1V" secondAttribute="trailing" id="VBM-8b-jP0"/>
                            <constraint firstItem="VQn-bS-fWp" firstAttribute="top" secondItem="m5L-O7-P31" secondAttribute="bottom" constant="8" id="VpS-4N-mOo"/>
                            <constraint firstItem="wUL-9N-u1V" firstAttribute="top" secondItem="2EB-m2-a3L" secondAttribute="bottom" constant="35" id="VpU-j2-gaE"/>
@@ -203,10 +209,12 @@
                            <constraint firstItem="ZZh-fw-LwK" firstAttribute="top" secondItem="6Tk-OE-BBY" secondAttribute="top" id="eIC-fZ-OEE"/>
                            <constraint firstItem="976-fk-Kx2" firstAttribute="centerY" secondItem="wUL-9N-u1V" secondAttribute="centerY" id="fFg-pB-eyU"/>
                            <constraint firstItem="6Tk-OE-BBY" firstAttribute="bottom" secondItem="wUL-9N-u1V" secondAttribute="bottom" constant="40" id="fG6-0p-I0P"/>
+                            <constraint firstItem="Cil-py-NiA" firstAttribute="trailing" secondItem="6Tk-OE-BBY" secondAttribute="trailing" id="gGK-DB-ibv"/>
                            <constraint firstItem="XpL-9M-UOp" firstAttribute="leading" secondItem="w7H-Sk-Rai" secondAttribute="trailing" id="guC-Db-cA9"/>
                            <constraint firstItem="6MG-gv-hD5" firstAttribute="leading" secondItem="avL-VK-Kha" secondAttribute="trailing" constant="10" id="jNW-iC-u7V"/>
                            <constraint firstItem="4ey-Xr-U4e" firstAttribute="bottom" secondItem="6Tk-OE-BBY" secondAttribute="bottom" id="o1X-q5-P7j"/>
                            <constraint firstItem="6MG-gv-hD5" firstAttribute="top" secondItem="VQn-bS-fWp" secondAttribute="bottom" constant="8" id="tAE-ss-jlA"/>
+                            <constraint firstItem="Cil-py-NiA" firstAttribute="leading" secondItem="ZZh-fw-LwK" secondAttribute="trailing" id="teJ-PP-h2R"/>
                            <constraint firstItem="4ey-Xr-U4e" firstAttribute="top" secondItem="wUL-9N-u1V" secondAttribute="bottom" constant="10" id="udc-wT-jqd"/>
                            <constraint firstItem="ZZh-fw-LwK" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" id="vXI-l2-CjL"/>
                            <constraint firstItem="VQn-bS-fWp" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" constant="10" id="wtI-Dl-YPq"/>
@@ -223,11 +231,12 @@
                        <outlet property="resultTextView" destination="VQn-bS-fWp" id="306-c7-3vM"/>
                        <outlet property="selectImageView" destination="ZZh-fw-LwK" id="afR-Bv-6AW"/>
                        <outlet property="threadPickerView" destination="DlO-dk-RMr" id="Kk4-QV-b5o"/>
+                        <outlet property="videoView" destination="Cil-py-NiA" id="QY2-BP-SNS"/>
                    </connections>
                </viewController>
                <placeholder placeholderIdentifier="IBFirstResponder" id="dkx-z0-nzr" sceneMemberID="firstResponder"/>
            </objects>
-            <point key="canvasLocation" x="-719" y="-18"/>
+            <point key="canvasLocation" x="-1543.2" y="-147.07646176911544"/>
        </scene>
    </scenes>
    <resources>

--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Multi-Predict-ViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Multi-Predict-ViewController.swift
@@ -14,11 +14,53 @@ class Multi_Predict_ViewController: UIViewController {
  var runner2: Runner!
  override func viewDidLoad() {
    super.viewDidLoad()
-//    let net = MobileNet_ssd_hand.init(device: MetalHelper.shared.device)
-//    runner1 = Runner.init(inNet: <#T##Net#>, commandQueue: <#T##MTLCommandQueue?#>, inPlatform: <#T##Platform#>)
+    let mobileNet = MobileNet_ssd_hand.init(device: MetalHelper.shared.device)
+    let genet = Genet.init(device: MetalHelper.shared.device)
+    runner1 = Runner.init(inNet: mobileNet, commandQueue: MetalHelper.shared.queue, inPlatform: .GPU)
+    let queue2 = MetalHelper.shared.device.makeCommandQueue()
+    
+    runner2 = Runner.init(inNet: genet, commandQueue: MetalHelper.shared.queue, inPlatform: .GPU)
+    
+   
+    
+    
+    
+    
  }

  @IBAction func predictAct(_ sender: Any) {
-    
+    let success = self.runner2.load()
+//    DispatchQueue.global().async {
+      let image1 = UIImage.init(named: "hand.jpg")
+//      let success = self.runner2.load()
+//      if success {
+//        for i in 0..<10000 {
+//          print(i)
+//          self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in
+//            print("result1: ")
+////            print(res)
+//          })
+//        }
+//      } else {
+//        print("load failed")
+//      }
+//      self.runner1.clear()
+//    }
+//    return
+//    DispatchQueue.global().async {
+////      sleep(1)
+//      let image1 = UIImage.init(named: "banana.jpeg")
+////      if success {
+//        for _ in 0..<10 {
+//          self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in
+//            print("result2: ")
+//            print(res)
+//          })
+//        }
+////      } else {
+////        print("load failed")
+////      }
+////      self.runner2.clear()
+//    }
  }
 }
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift
+
+
+import Foundation
+import QuartzCore
+
+/// Estimates a smoothed frames-per-second value from frame-completion events.
+public class FPSCounter {
+  /// Latest smoothed estimate; starts at 0.
+  private(set) public var fps: Double = 0
+
+  /// Frames counted since the current measurement window began.
+  var frames = 0
+  /// `CACurrentMediaTime()` value at which the current window began.
+  var startTime: CFTimeInterval = 0
+
+  /// Begins a new measurement window. The current `fps` value is kept.
+  public func start() {
+    startTime = CACurrentMediaTime()
+    frames = 0
+  }
+
+  /// Records one finished frame and refreshes `fps`.
+  ///
+  /// The estimate is only updated once more than 0.1 s of the window has
+  /// elapsed, and the window restarts after more than 1 s.
+  public func frameCompleted() {
+    frames += 1
+    let windowLength = CACurrentMediaTime() - startTime
+    guard windowLength > 0.1 else { return }
+
+    // Exponential smoothing: keep 75% of the previous estimate.
+    let weight = 0.75
+    let instantaneous = Double(frames) / windowLength
+    fps = weight*fps + (1 - weight)*instantaneous
+
+    guard windowLength > 1 else { return }
+    frames = 0
+    startTime = CACurrentMediaTime()
+  }
+}
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift
+
+import UIKit
+import Metal
+import CoreVideo
+import AVFoundation
+
+/// Callbacks through which `VideoCapture` hands out frames and photos.
+/// Every method is optional.
+@available(iOS 10.0, *)
+@objc public protocol VideoCaptureDelegate: NSObjectProtocol {
+  /// Delivers a captured video sample buffer together with its presentation timestamp.
+  @objc optional func videoCapture(_ capture: VideoCapture, didCaptureSampleBuffer sampleBuffer: CMSampleBuffer, timestamp: CMTime)
+    /// Delivers a captured video frame as a Metal texture; `texture` is nil
+    /// when the sample buffer could not be converted.
+    @objc optional func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime)
+    /// Delivers the preview image of a captured photo; nil when it could not be created.
+    @objc optional func videoCapture(_ capture: VideoCapture, didCapturePhoto previewImage: UIImage?)
+    /// Delivers a captured photo as a Metal texture; nil when conversion failed.
+    @objc optional func videoCapture(_ capture: VideoCapture, didCapturePhotoTexture texture: MTLTexture?)
+}
+
+/**
+  Simple interface to the iPhone's camera.
+*/
+@available(iOS 10.0, *)
+public class VideoCapture: NSObject {
+    public var previewLayer: AVCaptureVideoPreviewLayer?
+    public weak var delegate: VideoCaptureDelegate?
+    public var fps = -1
+    private let device: MTLDevice?
+    private let videoOrientation: AVCaptureVideoOrientation
+    private var textureCache: CVMetalTextureCache?
+    private let captureSession = AVCaptureSession()
+    private let videoOutput = AVCaptureVideoDataOutput()
+    private let photoOutput = AVCapturePhotoOutput()
+    private let queue = DispatchQueue(label: "net.machinethink.camera-queue")
+    private var lastTimestamp = CMTime()
+    private let cameraPosition: AVCaptureDevice.Position
+    /// Stores the capture configuration; the session itself is configured
+    /// later by `setUp(sessionPreset:completion:)`.
+    ///
+    /// - Parameters:
+    ///   - device: Metal device used to create the texture cache; when nil,
+    ///     no texture cache is created.
+    ///   - orientation: Orientation applied to the preview and video connections.
+    ///   - position: Which camera to open.
+    public init(device: MTLDevice? = nil,
+                orientation: AVCaptureVideoOrientation = .portrait,
+                position: AVCaptureDevice.Position = .back) {
+        self.cameraPosition = position
+        self.videoOrientation = orientation
+        self.device = device
+        super.init()
+    }
+
+    /// Configures the camera on the private capture queue.
+    ///
+    /// - Parameters:
+    ///   - sessionPreset: Preset passed on to `setUpCamera(sessionPreset:)`.
+    ///   - completion: Invoked on the main queue with the result of the setup.
+    public func setUp(sessionPreset: AVCaptureSession.Preset = .medium,
+                      completion: @escaping (Bool) -> Void) {
+        queue.async {
+            let didConfigure = self.setUpCamera(sessionPreset: sessionPreset)
+            DispatchQueue.main.async { completion(didConfigure) }
+        }
+    }
+
+    /// Returns the first front-facing built-in wide-angle camera, or nil when
+    /// the discovery session finds none.
+    func fontCamera() -> AVCaptureDevice? {
+        let discovery = AVCaptureDevice.DiscoverySession(deviceTypes: [.builtInWideAngleCamera],
+                                                         mediaType: AVMediaType.video,
+                                                         position: .front)
+        return discovery.devices.first
+    }
+    
+    /// Configures the capture session with the camera input, a preview layer,
+    /// a BGRA video data output and a photo output.
+    ///
+    /// When a Metal device was supplied at init, a texture cache is created first.
+    ///
+    /// - Parameter sessionPreset: Preset assigned to the capture session.
+    /// - Returns: `false` when the texture cache, the capture device or the
+    ///   device input cannot be created; `true` otherwise.
+    func setUpCamera(sessionPreset: AVCaptureSession.Preset) -> Bool {
+        if let inDevice = device {
+            guard CVMetalTextureCacheCreate(kCFAllocatorDefault, nil, inDevice, nil, &textureCache) == kCVReturnSuccess else {
+                print("Error: could not create a texture cache")
+                return false
+            }
+        }
+
+        captureSession.beginConfiguration()
+        // Balance beginConfiguration() on every exit path, including the
+        // early failure returns below; otherwise the session is left
+        // in the middle of a configuration transaction.
+        defer { captureSession.commitConfiguration() }
+        captureSession.sessionPreset = sessionPreset
+
+        let oCaptureDevice: AVCaptureDevice?
+        switch cameraPosition {
+        case .back:
+            oCaptureDevice = AVCaptureDevice.default(for: AVMediaType.video)
+        case .front:
+            oCaptureDevice = fontCamera()
+        default:
+            // Any other position has no matching camera here.
+            oCaptureDevice = nil
+        }
+
+        guard let captureDevice = oCaptureDevice else {
+            print("Error: no video devices available")
+            return false
+        }
+
+        guard let videoInput = try? AVCaptureDeviceInput(device: captureDevice) else {
+            print("Error: could not create AVCaptureDeviceInput")
+            return false
+        }
+
+        if captureSession.canAddInput(videoInput) {
+            captureSession.addInput(videoInput)
+        }
+
+        let previewLayer = AVCaptureVideoPreviewLayer(session: captureSession)
+        previewLayer.videoGravity = AVLayerVideoGravity.resizeAspect
+        previewLayer.connection?.videoOrientation = self.videoOrientation
+        self.previewLayer = previewLayer
+
+        let settings: [String : Any] = [
+            kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA)
+        ]
+
+        videoOutput.videoSettings = settings
+        videoOutput.alwaysDiscardsLateVideoFrames = true
+        videoOutput.setSampleBufferDelegate(self, queue: queue)
+        if captureSession.canAddOutput(videoOutput) {
+            captureSession.addOutput(videoOutput)
+        }
+
+        // We want the buffers to be in portrait orientation otherwise they are
+        // rotated by 90 degrees. Need to set this _after_ addOutput()!
+        videoOutput.connection(with: AVMediaType.video)?.videoOrientation = self.videoOrientation
+
+        if captureSession.canAddOutput(photoOutput) {
+            captureSession.addOutput(photoOutput)
+        }
+
+        // commitConfiguration() runs via the defer above.
+        return true
+    }
+
+    /// Starts the capture session unless it is already running.
+    public func start() {
+        guard !captureSession.isRunning else { return }
+        captureSession.startRunning()
+    }
+
+    /// Stops the capture session; does nothing when it is not running.
+    public func stop() {
+        guard captureSession.isRunning else { return }
+        captureSession.stopRunning()
+    }
+
+    /// Captures a single frame of the camera input.
+    ///
+    /// The photo is requested in 32BGRA; the result is delivered to this
+    /// object's `AVCapturePhotoCaptureDelegate` callback.
+    public func capturePhoto() {
+        let pixelFormatKey = kCVPixelBufferPixelFormatTypeKey as String
+        let settings = AVCapturePhotoSettings(format: [pixelFormatKey: NSNumber(value: kCVPixelFormatType_32BGRA)])
+        // Request a 480x360 preview only when a preview pixel format is
+        // available; indexing [0] on an empty list would trap.
+        if let previewPixelFormat = settings.__availablePreviewPhotoPixelFormatTypes.first {
+            settings.previewPhotoFormat = [
+                pixelFormatKey: previewPixelFormat,
+                kCVPixelBufferWidthKey as String: 480,
+                kCVPixelBufferHeightKey as String: 360,
+            ]
+        }
+        photoOutput.capturePhoto(with: settings, delegate: self)
+    }
+
+    /// Wraps the image buffer of `sampleBuffer` in a `.bgra8Unorm` Metal texture.
+    ///
+    /// - Returns: nil when there is no texture cache, no sample buffer, no
+    ///   image buffer, or the texture could not be created.
+    func convertToMTLTexture(sampleBuffer: CMSampleBuffer?) -> MTLTexture? {
+        guard let cache = textureCache,
+            let buffer = sampleBuffer,
+            let pixelBuffer = CMSampleBufferGetImageBuffer(buffer) else {
+            return nil
+        }
+        let pixelWidth = CVPixelBufferGetWidth(pixelBuffer)
+        let pixelHeight = CVPixelBufferGetHeight(pixelBuffer)
+        var cvTexture: CVMetalTexture?
+        CVMetalTextureCacheCreateTextureFromImage(kCFAllocatorDefault, cache, pixelBuffer, nil, .bgra8Unorm, pixelWidth, pixelHeight, 0, &cvTexture)
+        guard let created = cvTexture else {
+            return nil
+        }
+        return CVMetalTextureGetTexture(created)
+    }
+
+    /// Renders the image buffer of `sampleBuffer` into a `UIImage` via Core Image.
+    ///
+    /// - Returns: nil when there is no sample buffer, no image buffer, or the
+    ///   `CGImage` could not be created.
+    func convertToUIImage(sampleBuffer: CMSampleBuffer?) -> UIImage? {
+        guard let buffer = sampleBuffer,
+            let pixelBuffer = CMSampleBufferGetImageBuffer(buffer) else {
+            return nil
+        }
+        let pixelWidth = CVPixelBufferGetWidth(pixelBuffer)
+        let pixelHeight = CVPixelBufferGetHeight(pixelBuffer)
+        // Render the full extent of the pixel buffer.
+        let fullFrame = CGRect(x: 0, y: 0, width: CGFloat(pixelWidth), height: CGFloat(pixelHeight))
+        let source = CIImage(cvPixelBuffer: pixelBuffer)
+        let renderer = CIContext(options: nil)
+        guard let rendered = renderer.createCGImage(source, from: fullFrame) else {
+            return nil
+        }
+        return UIImage(cgImage: rendered)
+    }
+}
+
+// MARK: - AVCaptureVideoDataOutputSampleBufferDelegate
+extension VideoCapture: AVCaptureVideoDataOutputSampleBufferDelegate {
+  /// Forwards captured frames to the delegate, limited to the requested `fps`.
+  public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
+    // The camera keeps capturing at full speed so the preview stays smooth;
+    // only the delegate callbacks are limited to the desired rate.
+    // An `fps` of -1 disables the limit.
+    let timestamp = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
+    let sinceLastDelivery = timestamp - lastTimestamp
+    guard fps == -1 || sinceLastDelivery >= CMTimeMake(1, Int32(fps)) else {
+      return
+    }
+    lastTimestamp = timestamp
+    delegate?.videoCapture?(self, didCaptureSampleBuffer: sampleBuffer, timestamp: timestamp)
+
+    // Convert to a texture only when the delegate implements the texture callback.
+    let textureCallback = #selector(VideoCaptureDelegate.videoCapture(_:didCaptureVideoTexture:timestamp:))
+    if delegate?.responds(to: textureCallback) == true {
+      let videoTexture = convertToMTLTexture(sampleBuffer: sampleBuffer)
+      delegate?.videoCapture?(self, didCaptureVideoTexture: videoTexture, timestamp: timestamp)
+    }
+  }
+
+  /// Logs every frame the video output reports as dropped.
+  public func captureOutput(_ output: AVCaptureOutput, didDrop sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
+    print("dropped frame")
+  }
+}
+
+// MARK: - AVCapturePhotoCaptureDelegate
+extension VideoCapture: AVCapturePhotoCaptureDelegate {
+  /// Hands a finished photo to the delegate as a Metal texture and/or a
+  /// preview image, depending on which callbacks the delegate implements.
+  /// Nothing is delivered when `error` is set.
+  public func photoOutput(_ captureOutput: AVCapturePhotoOutput,
+                          didFinishProcessingPhoto photoSampleBuffer: CMSampleBuffer?,
+                          previewPhoto previewPhotoSampleBuffer: CMSampleBuffer?,
+                          resolvedSettings: AVCaptureResolvedPhotoSettings,
+                          bracketSettings: AVCaptureBracketedStillImageSettings?,
+                          error: Error?) {
+    guard error == nil else {
+      return
+    }
+
+    let textureCallback = #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhotoTexture:))
+    if delegate?.responds(to: textureCallback) == true {
+      let photoTexture = convertToMTLTexture(sampleBuffer: photoSampleBuffer)
+      delegate?.videoCapture?(self, didCapturePhotoTexture: photoTexture)
+    }
+
+    let previewCallback = #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhoto:))
+    if delegate?.responds(to: previewCallback) == true {
+      let preview = convertToUIImage(sampleBuffer: previewPhotoSampleBuffer)
+      delegate?.videoCapture?(self, didCapturePhoto: preview)
+    }
+  }
+}
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
@@ -14,13 +14,15 @@

 import UIKit
 import MetalKit
+import CoreMedia
 import paddle_mobile
 import MetalPerformanceShaders

 let platform: Platform = .GPU
 let threadSupport = [1]

-let modelHelperMap: [SupportModel : Runner] = [.mobilenet_ssd : Runner.init(inNet: MobileNet_ssd_hand.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform),
+//.mobilenet_ssd : Runner.init(inNet: MobileNet_ssd_hand.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform),
+let modelHelperMap: [SupportModel : Runner] = [
                                               .genet : Runner.init(inNet: Genet.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform),
                                               .mobilenet_ssd_ar : Runner.init(inNet: MobileNet_ssd_AR.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform)]
 //, .genet : Genet.init()
@@ -28,13 +30,14 @@ let modelHelperMap: [SupportModel : Runner] = [.mobilenet_ssd : Runner.init(inNe

 enum SupportModel: String{
  //  case mobilenet = "mobilenet"
-  case mobilenet_ssd    = "mobilenetssd"
+//  case mobilenet_ssd    = "mobilenetssd"
  case genet            = "genet"
  case mobilenet_ssd_ar = "mobilenetssd_ar"
  
  static func supportedModels() -> [SupportModel] {
-    //.mobilenet,
-    return [.mobilenet_ssd, .genet, .mobilenet_ssd_ar]
+    // .mobilenet,
+    // .mobilenet_ssd,
+    return [.genet, .mobilenet_ssd_ar]
  }
 }

@@ -44,14 +47,15 @@ class ViewController: UIViewController {
  @IBOutlet weak var elapsedTimeLabel: UILabel!
  @IBOutlet weak var modelPickerView: UIPickerView!
  @IBOutlet weak var threadPickerView: UIPickerView!
-  
+  @IBOutlet weak var videoView: UIView!
+  var videoCapture: VideoCapture!
+
  var selectImage: UIImage?
  var inputPointer: UnsafeMutablePointer<Float32>?
  var modelType: SupportModel = SupportModel.supportedModels()[0]
  var toPredictTexture: MTLTexture?
  
  var runner: Runner {
-    
    get {
      return modelHelperMap[modelType] ?! " has no this type "
    }
@@ -81,7 +85,7 @@ class ViewController: UIViewController {
  }
  
  @IBAction func predictAct(_ sender: Any) {
-    let max = 1
+    let max = 50
    switch platform {
    case .GPU:
      guard let inTexture = toPredictTexture else {
@@ -91,7 +95,7 @@ class ViewController: UIViewController {
      
      let startDate = Date.init()
      for i in 0..<max {
-        runner.predict(texture: inTexture) { [weak self] (success, res) in
+        runner.predict(texture: inTexture) { [weak self] (success, resultHolder)  in
          guard let sSelf = self else {
            fatalError()
          }
@@ -99,11 +103,18 @@ class ViewController: UIViewController {
            if i == max - 1 {
              let time = Date.init().timeIntervalSince(startDate)
              DispatchQueue.main.async {
-                sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: res)
+//                print(resultHolder!.result![0])
+//                sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: res)
                sSelf.elapsedTimeLabel.text = "平均耗时: \(time/Double(max) * 1000.0) ms"
+               
              }
            }
          }
+          
+          DispatchQueue.main.async {
+            resultHolder?.releasePointer()
+          }
+//            print("释放")
        }
 //        print("sleep before ")
 //        usleep(33000)
@@ -129,7 +140,7 @@ class ViewController: UIViewController {
            if i == max - 1 {
              let time = Date.init().timeIntervalSince(startDate)
              DispatchQueue.main.async {
-                sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: res)
+//                sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: res)
                sSelf.elapsedTimeLabel.text = "平均耗时: \(time/Double(max) * 1000.0) ms"
              }
            }
@@ -141,6 +152,13 @@ class ViewController: UIViewController {
  
  override func viewDidLoad() {
    super.viewDidLoad()
+    
+//    if runner.load() {
+//      print(" load success ! ")
+//    } else {
+//      print(" load error ! ")
+//    }
+//    
    modelPickerView.delegate = self
    modelPickerView.dataSource = self
    threadPickerView.delegate = self
@@ -158,6 +176,20 @@ class ViewController: UIViewController {
    } else {
      fatalError( " unsupport " )
    }
+    
+//    videoCapture = VideoCapture.init(device: MetalHelper.shared.device, orientation: .portrait, position: .back)
+//    videoCapture.fps = 30
+//    videoCapture.delegate = self
+//    videoCapture.setUp { (success) in
+//      DispatchQueue.main.async {
+//        if let preViewLayer = self.videoCapture.previewLayer {
+//          self.videoView.layer.addSublayer(preViewLayer)
+//          self.videoCapture.previewLayer?.frame = self.videoView.bounds
+//        }
+//        self.videoCapture.start()
+//      }
+//    }
+
  }
 }

@@ -218,4 +250,32 @@ extension ViewController:  UIImagePickerControllerDelegate, UINavigationControll
  }
 }

+var bool1 = false
+extension ViewController: VideoCaptureDelegate{
+  func predictTexture(texture: MTLTexture){
+    runner.scaleTexture(input: texture) { (scaledTexture) in
+      self.runner.predict(texture: scaledTexture, completion: { (success, resultHolder) in
+//        print(resultHolder!.result![0])
+        resultHolder?.releasePointer()
+      })
+    }
+  }
+  
+  
+  func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime) {
+    // `texture` is nil when the frame could not be converted to a Metal
+    // texture; skip such frames instead of crashing on a force-unwrap.
+    guard let texture = texture else {
+      return
+    }
+//    if !bool1 {
+    self.predictTexture(texture: texture)
+//      bool1 = true
+//    }
+  }
+
+}
+
+
+

--- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
@@ -75,6 +75,12 @@
 		FC4FD97E2140F2C30073E130 /* libstdc++.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = FC4FD97D2140F2C30073E130 /* libstdc++.tbd */; };
 		FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC5163F520EF556E00636C28 /* Texture2DTo2DArrayKernel.swift */; };
 		FC60DB8920E9AAA500FF203F /* MetalExtension.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC60DB8820E9AAA500FF203F /* MetalExtension.swift */; };
+		FC803BBF214CB65A0094B8E5 /* ConvAddPreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */; };
+		FC803BC1214CB77A0094B8E5 /* ConvAddPreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC0214CB77A0094B8E5 /* ConvAddPreluKernel.swift */; };
+		FC803BC3214CB79C0094B8E5 /* ConvAddPreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC2214CB79C0094B8E5 /* ConvAddPreluKernel.metal */; };
+		FC803BC5214CB8F00094B8E5 /* ConvAddPrelu.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC4214CB8F00094B8E5 /* ConvAddPrelu.inc.metal */; };
+		FC803BC7214CBA820094B8E5 /* Macro.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC6214CBA820094B8E5 /* Macro.metal */; };
+		FC803BC9214CFC8D0094B8E5 /* FetchKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */; };
 		FC82735920E3C04200BE430A /* OpCreator.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC82735820E3C04200BE430A /* OpCreator.swift */; };
 		FC9A19E32148C31300CD9CBF /* MobilenetSSD_AR.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9A19E22148C31300CD9CBF /* MobilenetSSD_AR.swift */; };
 		FC9D037920E229E4000F735A /* OpParam.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D037820E229E4000F735A /* OpParam.swift */; };
@@ -116,6 +122,8 @@
 		FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */; };
 		FCDDC6CF212FE14700E5EF74 /* PriorBoxKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */; };
 		FCDE8A33212A917900F4A8F6 /* ConvTransposeOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */; };
+		FCE9D7B7214F869000B520C3 /* Net.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE9D7B6214F869000B520C3 /* Net.swift */; };
+		FCE9D7B9214FAA4800B520C3 /* NMSFetchResultKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */; };
 		FCEB684A212F00DB00D2448E /* PreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCEB6849212F00DB00D2448E /* PreluKernel.metal */; };
 		FCEB684C212F093800D2448E /* PreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEB684B212F093800D2448E /* PreluOp.swift */; };
 		FCEBC0F420F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEBC0F320F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift */; };
@@ -196,6 +204,12 @@
 		FC4FD97D2140F2C30073E130 /* libstdc++.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = "libstdc++.tbd"; path = "usr/lib/libstdc++.tbd"; sourceTree = SDKROOT; };
 		FC5163F520EF556E00636C28 /* Texture2DTo2DArrayKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Texture2DTo2DArrayKernel.swift; sourceTree = "<group>"; };
 		FC60DB8820E9AAA500FF203F /* MetalExtension.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MetalExtension.swift; sourceTree = "<group>"; };
+		FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddPreluOp.swift; sourceTree = "<group>"; };
+		FC803BC0214CB77A0094B8E5 /* ConvAddPreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddPreluKernel.swift; sourceTree = "<group>"; };
+		FC803BC2214CB79C0094B8E5 /* ConvAddPreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddPreluKernel.metal; sourceTree = "<group>"; };
+		FC803BC4214CB8F00094B8E5 /* ConvAddPrelu.inc.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddPrelu.inc.metal; sourceTree = "<group>"; };
+		FC803BC6214CBA820094B8E5 /* Macro.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Macro.metal; sourceTree = "<group>"; };
+		FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = FetchKernel.metal; sourceTree = "<group>"; };
 		FC82735820E3C04200BE430A /* OpCreator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpCreator.swift; sourceTree = "<group>"; };
 		FC9A19E22148C31300CD9CBF /* MobilenetSSD_AR.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MobilenetSSD_AR.swift; sourceTree = "<group>"; };
 		FC9D037820E229E4000F735A /* OpParam.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpParam.swift; sourceTree = "<group>"; };
@@ -237,6 +251,8 @@
 		FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ReluKernel.metal; sourceTree = "<group>"; };
 		FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PriorBoxKernel.metal; sourceTree = "<group>"; };
 		FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvTransposeOp.swift; sourceTree = "<group>"; };
+		FCE9D7B6214F869000B520C3 /* Net.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Net.swift; sourceTree = "<group>"; };
+		FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = NMSFetchResultKernel.metal; sourceTree = "<group>"; };
 		FCEB6849212F00DB00D2448E /* PreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PreluKernel.metal; sourceTree = "<group>"; };
 		FCEB684B212F093800D2448E /* PreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PreluOp.swift; sourceTree = "<group>"; };
 		FCEBC0F320F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; name = ConvAddBatchNormReluOp.swift; path = "paddle-mobile/Operators/ConvAddBatchNormReluOp.swift"; sourceTree = SOURCE_ROOT; };
@@ -297,6 +313,7 @@
 		FC039B6C20E11C3C0081E9F8 /* paddle-mobile */ = {
 			isa = PBXGroup;
 			children = (
+				FCE9D7B6214F869000B520C3 /* Net.swift */,
 				FC9A19E22148C31300CD9CBF /* MobilenetSSD_AR.swift */,
 				FC33B0EF2147659000714A93 /* MobileNet.swift */,
 				FC292C862142624800CF622F /* Genet.swift */,
@@ -372,6 +389,7 @@
 				FCBCCC6E2123097100D94F7E /* MulticlassNMSOp.swift */,
 				FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */,
 				FCEB684B212F093800D2448E /* PreluOp.swift */,
+				FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */,
 			);
 			path = Operators;
 			sourceTree = "<group>";
@@ -420,6 +438,7 @@
 				4AA1EA87214662BD00D0F791 /* BilinearInterpKernel.swift */,
 				FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */,
 				FCDDC6C5212F9FB800E5EF74 /* PreluKernel.swift */,
+				FC803BC0214CB77A0094B8E5 /* ConvAddPreluKernel.swift */,
 			);
 			path = Kernels;
 			sourceTree = "<group>";
@@ -482,6 +501,11 @@
 				FC0226552138F33800F395E2 /* TransposeKernel.metal */,
 				4AA1EAAD214F5FD900D0F791 /* TransposeKernel.inc.metal */,
 				FC0226572138F38D00F395E2 /* PoolKernel.metal */,
+				FC803BC2214CB79C0094B8E5 /* ConvAddPreluKernel.metal */,
+				FC803BC4214CB8F00094B8E5 /* ConvAddPrelu.inc.metal */,
+				FC803BC6214CBA820094B8E5 /* Macro.metal */,
+				FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */,
+				FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */,
 			);
 			path = metal;
 			sourceTree = "<group>";
@@ -594,6 +618,7 @@
 				FC9D038020E22FBB000F735A /* FeedOp.swift in Sources */,
 				4AA1EAAA214F53D800D0F791 /* BoxCoder.inc.metal in Sources */,
 				FC039B9F20E11CB20081E9F8 /* Tensor.swift in Sources */,
+				FC803BC9214CFC8D0094B8E5 /* FetchKernel.metal in Sources */,
 				FCA67CD7213827AC00BD58AA /* ConvAddBNReluKernel.metal in Sources */,
 				4AF9287921341661005B6C3A /* Softmax.metal in Sources */,
 				4AA1EA942146661500D0F791 /* ShapeKernel.swift in Sources */,
@@ -603,6 +628,7 @@
 				FCBCCC6B2123071700D94F7E /* BoxcoderOp.swift in Sources */,
 				4AA1EAAE214F5FD900D0F791 /* TransposeKernel.inc.metal in Sources */,
 				4AA1EAA4214A295C00D0F791 /* Split.inc.metal in Sources */,
+				FC803BC7214CBA820094B8E5 /* Macro.metal in Sources */,
 				FC039B9B20E11CA00081E9F8 /* Executor.swift in Sources */,
 				4AF9288421357BE3005B6C3A /* Elementwise.metal in Sources */,
 				FCD04E7020F31B720007374F /* ReshapeKernel.swift in Sources */,
@@ -622,6 +648,7 @@
 				4AA1EAAC214F55C800D0F791 /* Softmax.inc.metal in Sources */,
 				FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */,
 				4AF928772133F1DB005B6C3A /* BoxCoder.metal in Sources */,
+				FC803BBF214CB65A0094B8E5 /* ConvAddPreluOp.swift in Sources */,
 				FC33B0F02147659000714A93 /* MobileNet.swift in Sources */,
 				FCEB684C212F093800D2448E /* PreluOp.swift in Sources */,
 				4AA1EAA8214B7AFB00D0F791 /* BilinearInterp.inc.metal in Sources */,
@@ -653,10 +680,12 @@
 				FC039BB920E11CC20081E9F8 /* Scope.swift in Sources */,
 				FC292C5621421B4600CF622F /* PaddleMobileGPU.m in Sources */,
 				FCD04E6620F314C50007374F /* PoolOp.swift in Sources */,
+				FCE9D7B9214FAA4800B520C3 /* NMSFetchResultKernel.metal in Sources */,
 				FC039BAC20E11CBC0081E9F8 /* BatchNormOp.swift in Sources */,
 				FCBCCC6F2123097100D94F7E /* MulticlassNMSOp.swift in Sources */,
 				FC039BBC20E11CC20081E9F8 /* VarDesc.swift in Sources */,
 				FC292C872142624800CF622F /* Genet.swift in Sources */,
+				FC803BC5214CB8F00094B8E5 /* ConvAddPrelu.inc.metal in Sources */,
 				4AF928822135673D005B6C3A /* ConcatKernel.metal in Sources */,
 				FCBCCC632122FCC000D94F7E /* TransposeKernel.swift in Sources */,
 				FCBCCC71212309A700D94F7E /* MulticlassNMSKernel.swift in Sources */,
@@ -669,7 +698,9 @@
 				FC82735920E3C04200BE430A /* OpCreator.swift in Sources */,
 				FCA3A1652132A5EB00084FE5 /* Common.metal in Sources */,
 				4AA1EA92214665D700D0F791 /* ShapeOp.swift in Sources */,
+				FC803BC1214CB77A0094B8E5 /* ConvAddPreluKernel.swift in Sources */,
 				FCBCCC5D2122F8A100D94F7E /* DepthwiseConvOp.swift in Sources */,
+				FCE9D7B7214F869000B520C3 /* Net.swift in Sources */,
 				FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */,
 				FC039BAB20E11CBC0081E9F8 /* Operator.swift in Sources */,
 				FCD04E6A20F319EC0007374F /* SoftmaxOp.swift in Sources */,
@@ -689,6 +720,7 @@
 				FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */,
 				FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */,
 				FC039BC020E11CC20081E9F8 /* BlockDesc.swift in Sources */,
+				FC803BC3214CB79C0094B8E5 /* ConvAddPreluKernel.metal in Sources */,
 				4AA1EA90214664CD00D0F791 /* Split.metal in Sources */,
 				FCD04E6820F315020007374F /* PoolKernel.swift in Sources */,
 				FC0226582138F38D00F395E2 /* PoolKernel.metal in Sources */,

--- a/metal/paddle-mobile/paddle-mobile/CPUCompute.h
+++ b/metal/paddle-mobile/paddle-mobile/CPUCompute.h
@@ -16,6 +16,12 @@

 #import <Foundation/Foundation.h>

+
+@interface CPUResult: NSObject  // result object returned by -[NMSCompute computeWithScore:andBBoxs:]
+@property (assign, nonatomic) float *output;  // raw pointer; `assign` means this object does not manage its memory
+@property (assign, nonatomic) int outputSize;  // presumably the number of floats behind `output` — confirm with the producer
+@end
+
 @interface NMSCompute: NSObject

 @property (assign, nonatomic) float scoreThredshold;
@@ -34,6 +40,6 @@

 @property (strong, nonatomic) NSArray<NSNumber *> *bboxDim;

-(NSArray<NSNumber *> *)computeWithScore:(float *)score andBBoxs:(float *)bbox;
+-(CPUResult *)computeWithScore:(float *)score andBBoxs:(float *)bbox;

 @end
--- a/metal/paddle-mobile/paddle-mobile/CPUCompute.mm
+++ b/metal/paddle-mobile/paddle-mobile/CPUCompute.mm
@@ -21,6 +21,8 @@
 #import <algorithm>


+
+
 struct NMSParam {
  
  float *score_data;
@@ -282,9 +284,12 @@ void MultiClassNMSCompute(NMSParam *param) {
  param->output_size = output_size;
 }

+@implementation CPUResult  // no custom behavior; the declared properties are auto-synthesized
+@end
+
 @implementation NMSCompute

-(NSArray<NSNumber *> *)computeWithScore:(float *)score andBBoxs:(float *)bbox {
+-(CPUResult *)computeWithScore:(float *)score andBBoxs:(float *)bbox {
  NMSParam param;
  param.box_data = bbox;
  param.score_data = score;
@@ -306,12 +311,10 @@ void MultiClassNMSCompute(NMSParam *param) {
  }
  param.box_dim = box_dim;
  MultiClassNMSCompute(&param);
-  NSMutableArray<NSNumber *> *output = [NSMutableArray arrayWithCapacity:param.output_size];
-  for (int i = 0; i < param.output_size; ++i) {
-    [output addObject:[NSNumber numberWithFloat:param.output[i]]];
-  }
-  delete param.output;
-  return output;
+  CPUResult *cr = [[CPUResult alloc] init];
+  cr.output = param.output;
+  cr.outputSize = param.output_size;
+  return cr;
 }

 @end

--- a/metal/paddle-mobile/paddle-mobile/Common/Types.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/Types.swift
@@ -243,7 +243,7 @@ extension Tensor: Variant {
 extension Texture: Variant {
 }

-extension ResultHolder: Variant {
+extension GPUResultHolder: Variant {
 }

 extension InputTexture: Variant {
@@ -252,3 +252,43 @@ extension InputTexture: Variant {
 extension MTLTexture where Self: Variant {
  
 }
+
+/// Variant whose contents live in a Metal buffer that is read back as `Float32` values.
+class FetchHolder: Variant {
+  var resultBuffer: MTLBuffer?  // nil until initBuffer(device:) has run
+  var dim: [Int]
+  var capacity: Int  // number of Float32 elements the buffer is sized for
+  
+  init(inCapacity: Int, inDim: [Int]) {
+    capacity = inCapacity
+    dim = inDim
+  }
+  
+  /// Allocates `resultBuffer` on `device`, sized for `capacity` Float32 values.
+  func initBuffer(device: MTLDevice) {
+    resultBuffer = device.makeBuffer(length: capacity * MemoryLayout<Float32>.stride, options: [])
+  }
+  
+  /// Buffer contents bound as `capacity` Float32 values; traps when no buffer exists.
+  var result: UnsafeMutablePointer<Float32> {
+    guard let buffer = resultBuffer else { fatalError(" FetchHolder has no result buffer; call initBuffer(device:) first ") }
+    return buffer.contents().bindMemory(to: Float32.self, capacity: capacity)
+  }
+}
+
+extension FetchHolder: CustomStringConvertible, CustomDebugStringConvertible {
+  /// Summary built from metadata only, so printing or interpolating a holder
+  /// is safe even before its buffer exists (the body used to trap).
+  var description: String {
+    return "FetchHolder(dim: \(dim), capacity: \(capacity))"
+  }
+  
+  /// Same summary as `description`, plus whether a result buffer is present.
+  var debugDescription: String {
+    let bufferState = resultBuffer == nil ? "unallocated" : "allocated"
+    return "FetchHolder(dim: \(dim), capacity: \(capacity), buffer: \(bufferState))"
+  }
+}
+
+
+
--- a/metal/paddle-mobile/paddle-mobile/Genet.swift
+++ b/metal/paddle-mobile/paddle-mobile/Genet.swift
@@ -34,8 +34,9 @@ public class Genet: Net {
    }
  }
  
-  override  public func resultStr(res: [Float]) -> String {
-    return " \(Array<Float>(res.suffix(10))) ... "
+  override  public func resultStr(res: ResultHolder) -> String {
+    fatalError()
+//    return " \(Array<Float>(res.suffix(10))) ... "
  }
  
 }
--- a/metal/paddle-mobile/paddle-mobile/MobileNet.swift
+++ b/metal/paddle-mobile/paddle-mobile/MobileNet.swift
@@ -42,9 +42,12 @@ class MobileNet: Net{
  
  let labels = PreWords.init(fileName: "synset")
  
-  override public func resultStr(res: [Float]) -> String {
+  override public func resultStr(res: ResultHolder) -> String {
+    guard let resPointer = res.result else {
+      fatalError()
+    }
    var s: [String] = []
-    res.top(r: 5).enumerated().forEach{
+    (0..<res.capacity).map { resPointer[$0] }.top(r: 5).enumerated().forEach{
      s.append(String(format: "%d: %@ (%3.2f%%)", $0 + 1, labels[$1.0], $1.1 * 100))
    }
    return s.joined(separator: "\n")

--- a/metal/paddle-mobile/paddle-mobile/MobileNetSSD.swift
+++ b/metal/paddle-mobile/paddle-mobile/MobileNetSSD.swift
@@ -34,51 +34,52 @@ public class MobileNet_ssd_hand: Net{
    }
  }
  
-  override public func resultStr(res: [Float]) -> String {
+  override public func resultStr(res: ResultHolder) -> String {
    return " \(res)"
  }
  
-  override func fetchResult(paddleMobileRes: ResultHolder) -> [Float32] {
+  override func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder {

-    guard let interRes = paddleMobileRes.intermediateResults else {
-      fatalError(" need have inter result ")
-    }
-
-    guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as?  Texture<Float32> else {
-      fatalError(" need score ")
-    }
-
-    guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture<Float32> else {
-      fatalError()
-    }
-    
-    var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3]))
-//    print("score: ")
-//    print(scoreFormatArr.strideArray())
+//    guard let interRes = paddleMobileRes.intermediateResults else {
+//      fatalError(" need have inter result ")
+//    }
 //
-    var bboxArr = bbox.metalTexture.float32Array()
-//    print("bbox: ")
-//    print(bboxArr.strideArray())
-    
-    let nmsCompute = NMSCompute.init()
-    nmsCompute.scoreThredshold = 0.01
-    nmsCompute.nmsTopK = 400
-    nmsCompute.keepTopK = 200
-    nmsCompute.nmsEta = 1.0
-    nmsCompute.nmsThreshold = 0.45
-    nmsCompute.background_label = 0;
-    
-    nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])]
-
-    nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])]
-    guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else {
-      fatalError( " result error " )
-    }
-
-    let output: [Float32] = result.map { $0.floatValue }
-    
-    
-    return output
+//    guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as?  Texture<Float32> else {
+//      fatalError(" need score ")
+//    }
+//
+//    guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture<Float32> else {
+//      fatalError()
+//    }
+//
+//    var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3]))
+////    print("score: ")
+////    print(scoreFormatArr.strideArray())
+////
+//    var bboxArr = bbox.metalTexture.float32Array()
+////    print("bbox: ")
+////    print(bboxArr.strideArray())
+//
+//    let nmsCompute = NMSCompute.init()
+//    nmsCompute.scoreThredshold = 0.01
+//    nmsCompute.nmsTopK = 400
+//    nmsCompute.keepTopK = 200
+//    nmsCompute.nmsEta = 1.0
+//    nmsCompute.nmsThreshold = 0.45
+//    nmsCompute.background_label = 0;
+//
+//    nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])]
+//
+//    nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])]
+//    guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else {
+//      fatalError( " result error " )
+//    }
+//
+//    let output: [Float32] = result.map { $0.floatValue }
+//
+//
+//    return output
+    fatalError()
  }
  


--- a/metal/paddle-mobile/paddle-mobile/MobilenetSSD_AR.swift
+++ b/metal/paddle-mobile/paddle-mobile/MobilenetSSD_AR.swift
@@ -34,46 +34,52 @@ public class MobileNet_ssd_AR: Net{
    }
  }
  
-  override public func resultStr(res: [Float]) -> String {
+  override public func resultStr(res: ResultHolder) -> String {
    return " \(res)"
  }
  
-  override func fetchResult(paddleMobileRes: ResultHolder) -> [Float32] {
-    
+  override func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder {
    guard let interRes = paddleMobileRes.intermediateResults else {
      fatalError(" need have inter result ")
    }
    
-    guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as?  Texture<Float32> else {
+    guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as?  FetchHolder else {
      fatalError(" need score ")
    }
    
-    guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture<Float32> else {
+    guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? FetchHolder else {
      fatalError()
    }
    
-    var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3]))
-    //    print("score: ")
-    //    print(scoreFormatArr.strideArray())
-    //
-    var bboxArr = bbox.metalTexture.float32Array()
-    //    print("bbox: ")
-    //    print(bboxArr.strideArray())
+    let startDate = Date.init()
+    
+//    print("scoreFormatArr: ")
+//print((0..<score.capacity).map{ score.result[$0] }.strideArray())
+//
+//    print("bbox arr: ")
+//
+//    print((0..<bbox.capacity).map{ bbox.result[$0] }.strideArray())
    
    let nmsCompute = NMSCompute.init()
-    nmsCompute.scoreThredshold = 0.01
-    nmsCompute.nmsTopK = 400
-    nmsCompute.keepTopK = 200
+    nmsCompute.scoreThredshold = 0.25
+    nmsCompute.nmsTopK = 100
+    nmsCompute.keepTopK = 100
    nmsCompute.nmsEta = 1.0
-    nmsCompute.nmsThreshold = 0.45
+    nmsCompute.nmsThreshold = 0.449999988
    nmsCompute.background_label = 0;
-    nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])]
-    nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])]
-    guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else {
+    nmsCompute.scoreDim = [NSNumber.init(value: score.dim[0]), NSNumber.init(value: score.dim[1]), NSNumber.init(value: score.dim[2])]
+    nmsCompute.bboxDim = [NSNumber.init(value: bbox.dim[0]), NSNumber.init(value: bbox.dim[1]), NSNumber.init(value: bbox.dim[2])]
+    guard let result = nmsCompute.compute(withScore: score.result, andBBoxs: bbox.result) else {
      fatalError( " result error " )
    }
-    
-    let output: [Float32] = result.map { $0.floatValue }
-    return output
+    let resultHolder = ResultHolder.init(inResult: result.output, inCapacity: Int(result.outputSize))
+//    for i in 0..<Int(result.outputSize) {
+//
+//      print("i \(i) : \(result.output[i])")
+//    }
+//    print(Date.init().timeIntervalSince(startDate))
+
+//    print(resultHolder.result![0])
+    return resultHolder
  }
 }
--- a/metal/paddle-mobile/paddle-mobile/Net.swift
+++ b/metal/paddle-mobile/paddle-mobile/Net.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+
+import Foundation
+
+/// Wraps a raw `Float32` result buffer together with its element count.
+public class ResultHolder: NSObject {
+  /// Start of the result buffer; nil when there is no result or after `releasePointer()`.
+  public private(set) var result: UnsafeMutablePointer<Float32>?
+  /// Number of `Float32` elements behind `result`.
+  public let capacity: Int
+
+  init(inResult: UnsafeMutablePointer<Float32>?, inCapacity: Int) {
+    result = inResult
+    capacity = inCapacity
+  }
+  
+  /// Frees the buffer and clears `result`, so a repeated call is a no-op
+  /// instead of a double free, and no dangling pointer stays readable.
+  /// NOTE(review): `deallocate()` presumes the buffer came from a compatible
+  /// allocator — confirm for pointers handed over by NMSCompute and GPUResultHolder.
+  public func releasePointer() {
+    guard let buffer = result else {
+      return
+    }
+    result = nil
+    buffer.deinitialize(count: capacity)
+    buffer.deallocate()
+  }
+}
+
+/// Base network class. Holds the configuration fields below; `resultStr(res:)`
+/// traps here and is meant to be overridden, while `fetchResult` has a default.
+public class Net: NSObject {
+  var except = 0
+  var means = [Float]()
+  var scale: Float = 0
+  var dim = (n: 0, h: 0, w: 0, c: 0)
+  var preprocessKernel: CusomKernel?
+  var paramPointer: UnsafeMutableRawPointer?
+  var paramSize = 0
+  var modelPointer: UnsafeMutableRawPointer?
+  var modelSize = 0
+  var modelPath = ""
+  var paramPath = ""
+  var modelDir = ""
+  
+  /// Human-readable rendering of a result. The base implementation always traps.
+  public func resultStr(res: ResultHolder) -> String {
+    fatalError()
+  }
+  
+  /// Default conversion: wraps the GPU holder's result pointer and capacity
+  /// in a `ResultHolder` without copying.
+  func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder {
+    let pointer = paddleMobileRes.resultPointer
+    let count = paddleMobileRes.capacity
+    return ResultHolder(inResult: pointer, inCapacity: count)
+  }
+  
+  /// The base initializer does not use `device`; it only runs `NSObject`'s init.
+  @objc public init(device: MTLDevice) {
+    super.init()
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift
@@ -64,7 +64,8 @@ class OpCreator<P: PrecisionType> {
         gBilinearInterpType        :     BilinearInterpOp<P>.creat,
         gSplit                     :     SplitOp<P>.creat,
         gShape                     :     ShapeOp<P>.creat,
-         gFlatten                   :     FlattenOp<P>.creat]
+         gFlatten                   :     FlattenOp<P>.creat,
+         gConvAddPreluType          :     ConvAddPreluOp<P>.creat]
  
    private init(){}
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/Base/Operator.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/Operator.swift
@@ -19,6 +19,12 @@ protocol Fusion {
  static func fusionNode() -> Node
  static func change() -> [String : [(from: String, to: String)]]
  static func fusionType() -> String
+  static func needCheck() -> [(Int, String)]
+}
+extension Fusion {  // default used by conformers that do not provide their own needCheck()
+  static func needCheck() -> [(Int, String)] {
+    return []  // the default is an empty list
+  }
 }

 protocol Runable {
@@ -26,6 +32,7 @@ protocol Runable {
  func runImpl(device: MTLDevice,buffer: MTLCommandBuffer) throws
  func delogOutput()
  func inputVariant() -> [String : [Variant]]
+  func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer)
 }

 extension Runable where Self: OperatorProtocol{
@@ -38,11 +45,16 @@ extension Runable where Self: OperatorProtocol{
  }
  
  func inputVariant() -> [String : [Variant]] {
-    return [:]
-//    fatalError(" op \(type) need implement inputVariant")
+//    return [:]
+    fatalError(" op \(type) need implement inputVariant")
+  }
+  
+  func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) {
+    fatalError(" need implement ")
  }
  
  func delogOutput() {
+    
    print(type + ": has no implementation" )
  }
 }
@@ -144,6 +156,7 @@ let gBilinearInterpType         = "bilinear_interp"
 let gSplit                      = "split"
 let gShape                      = "shape"
 let gFlatten                    = "flatten"
+let gConvAddPreluType           = "conv_add_prelu"

 let opInfos = [gConvType                    : (inputs: ["Input"], outputs: ["Output"]),
               gBatchNormType               : (inputs: ["X"], outputs: ["Y"]),
@@ -169,5 +182,7 @@ let opInfos = [gConvType                    : (inputs: ["Input"], outputs: ["Out
               gBilinearInterpType          : (inputs: ["X"], outputs: ["Out"]),
               gSplit                       : (inputs: ["X"], outputs: ["Out"]),
               gShape                       : (inputs: ["Input"], outputs: ["Out"]),
-               gFlatten                     : (inputs: ["X"], outputs: ["Out"])
+               gFlatten                     : (inputs: ["X"], outputs: ["Out"]),
+               gConvAddPreluType            : (inputs: ["Input"], outputs: ["Out"])
+
              ]
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddPreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddPreluOp.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+/// Parameters of the `conv_add_prelu` operator, read from an op description
+/// and the scope it refers to.
+class ConvAddPreluParam<P: PrecisionType>: OpParam {
+  typealias ParamPrecisionType = P
+
+  let input: Texture<P>
+  let y: Tensor<ParamPrecisionType>
+  let filter: Tensor<ParamPrecisionType>
+  let mode: String
+  let alpha: Tensor<P>
+  var output: Texture<P>
+  let stride: [Int32]
+  let paddings: [Int32]
+  let dilations: [Int32]
+  let groups: Int
+
+  /// Resolves each input, output and attribute in turn. The first lookup that
+  /// throws aborts initialization and its error reaches the caller unchanged.
+  required init(opDesc: OpDesc, inScope: Scope) throws {
+    let attrs = opDesc.attrs
+    filter = try ConvAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
+    input = try ConvAddPreluParam.input(inputs: opDesc.inputs, from: inScope)
+    output = try ConvAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+    stride = try ConvAddPreluParam.getAttr(key: "strides", attrs: attrs)
+    paddings = try ConvAddPreluParam.getAttr(key: "paddings", attrs: attrs)
+    dilations = try ConvAddPreluParam.getAttr(key: "dilations", attrs: attrs)
+    groups = try ConvAddPreluParam.getAttr(key: "groups", attrs: attrs)
+    alpha = try ConvAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
+    mode = try ConvAddPreluParam.getAttr(key: "mode", attrs: attrs)
+    y = try ConvAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+  }
+}
+
+/// Operator registered under `gConvAddPreluType`; its fusion pattern is a
+/// conv node followed by an elementwise-add node and a prelu node.
+class ConvAddPreluOp<P: PrecisionType>: Operator<ConvAddPreluKernel<P>, ConvAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
+  typealias OpType = ConvAddPreluOp<P>
+  
+  /// Builds the node chain conv --> elementwise add --> prelu and returns its head.
+  static func fusionNode() -> Node {
+    let convNode = Node(inType: gConvType)
+    let addNode = Node(inType: gElementwiseAddType)
+    let preluNode = Node(inType: gPreluType)
+    _ = convNode --> addNode --> preluNode
+    return convNode
+  }
+  
+  /// This fusion declares no renames.
+  static func change() -> [String : [(from: String, to: String)]] {
+    return [:]
+  }
+  
+  /// Type name of the fused operator.
+  static func fusionType() -> String {
+    return gConvAddPreluType
+  }
+  
+  /// Derives the output dims: the first input dim, then one size per stride
+  /// entry from the convolution size formula, then the filter's first dim.
+  func inferShape() {
+    let inputDim = para.input.dim
+    let kernelDim = para.filter.dim
+    let strideList = para.stride
+    let paddingList = para.paddings
+    let dilationList = para.dilations
+    
+    var outDim = [inputDim[0]]
+    for (axis, rawStride) in strideList.enumerated() {
+      // Extent covered by the kernel once dilation is applied.
+      let dilatedKernel = Int(dilationList[axis]) * (kernelDim[axis + 1] - 1) + 1
+      let paddedInput = inputDim[axis + 1] + 2 * Int(paddingList[axis])
+      outDim.append((paddedInput - dilatedKernel) / Int(rawStride) + 1)
+    }
+    outDim.append(kernelDim[0])
+    para.output.dim = Dim.init(inDim: outDim)
+  }
+  
+  /// Encodes the kernel into `buffer`; errors from the kernel reach the caller unchanged.
+  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+    try kernel.compute(commandBuffer: buffer, param: para)
+  }
+  
+  /// Prints the operator type followed by a strided sample of the output texture.
+  func delogOutput() {
+    print(" \(type) output: ")
+    let texture = para.output.metalTexture
+    let outDims = para.output.tensorDim
+    let tensor = texture.toTensor(dim: (n: outDims[0], c: outDims[1], h: outDims[2], w: outDims[3]))
+    print(tensor.strideArray())
+  }
+  
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/FetchOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/FetchOp.swift
@@ -15,14 +15,15 @@
 import Foundation

 class FetchParam<P: PrecisionType>: OpParam{
-  var output: Texture<P>
+  var output: FetchHolder
  let input: Texture<P>
  let scope: Scope
  required init(opDesc: OpDesc, inScope: Scope) throws {
    scope = inScope
    do {
      input = try FetchParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = input
+      output = FetchHolder.init(inCapacity: input.numel(), inDim: input.tensorDim.dims)
+      scope.setOutput(output: output)
    } catch let error {
      throw error
    }
@@ -34,14 +35,40 @@ class FetchParam<P: PrecisionType>: OpParam{
 class FetchKernel<P: PrecisionType>: Kernel, Computable {
  
  func compute(commandBuffer: MTLCommandBuffer, param: FetchParam<P>) throws {
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+      throw PaddleMobileError.predictError(message: " encode is nil")
+    }
+    encoder.setTexture(param.input.metalTexture, index: 0)
+    encoder.setBuffer(param.output.resultBuffer!, offset: 0, index: 0)
+    encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture)
+    encoder.endEncoding()
  }
  
  required init(device: MTLDevice, param: FetchParam<P>) {
-    super.init(device: device, inFunctionName: "place_holder")
+    param.output.initBuffer(device: device)
+    if computePrecision == .Float16 {
+      if param.input.transpose == [0, 2, 3, 1] {
+        super.init(device: device, inFunctionName: "fetch_half")
+      } else {
+//        fatalError(" not support ")
+        super.init(device: device, inFunctionName: "fetch_placeholder_half")
+        print(" not support ")
+      }
+    } else if computePrecision == .Float32 {
+      if param.input.transpose == [0, 2, 3, 1] {
+        super.init(device: device, inFunctionName: "fetch")
+      } else {
+        print(" not support ")
+        super.init(device: device, inFunctionName: "fetch_placeholder")
+//        fatalError(" not support ")        
+      }
+    } else {
+      fatalError(" not support ")
+    }
  }
 }

-class FetchOp<P: PrecisionType>: Operator< FetchKernel<P>, FetchParam<P>>, Runable, Creator, InferShaperable{
+class FetchOp<P: PrecisionType>: Operator< FetchKernel<P>, FetchParam<P>>, Runable, Creator, InferShaperable {
  
  typealias OpType = FetchOp<P>

@@ -50,7 +77,11 @@ class FetchOp<P: PrecisionType>: Operator< FetchKernel<P>, FetchParam<P>>, Runab
  }
  
  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    scope.setOutput(output: para.output)
+    do {
+      try kernel.compute(commandBuffer: buffer, param: para)
+    } catch let error {
+      throw error
+    }
  }
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddPreluKernel.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+/// Kernel wrapper for the fused convolution + add + PReLU operator.
+///
+/// `init` prepares the output texture and the filter / bias / alpha buffers, selects the
+/// shader function matching the filter shape, PReLU mode and compute precision, and
+/// precomputes the `MetalConvParam` handed to the shader. `compute` encodes one dispatch.
+class ConvAddPreluKernel<P: PrecisionType>: Kernel, Computable {
+  /// Offsets, strides and dilations passed to the shader; assigned at the end of `init`.
+  var metalParam: MetalConvParam!
+
+  /// Creates the kernel and binds it to the matching shader function.
+  ///
+  /// The function name is `<shape>_prelu_<mode>_<precision>`:
+  /// - shape: `conv_add_1x1`, `depthwise_conv_add_3x3`, `conv_add_3x3`, `conv_add_5x1`
+  ///   (filter height 5, width 1) or `conv_add_1x5` (filter height 1, width 5);
+  /// - mode: `channel`, `element`, or `other` for any other `param.mode`;
+  /// - precision: `half` for `.Float16`, `float` for `.Float32`.
+  ///
+  /// Traps with `fatalError` on an unsupported filter shape or precision.
+  ///
+  /// - Parameters:
+  ///   - device: Metal device used to create the texture, buffers and pipeline.
+  ///   - param: Operator parameters; its output texture and buffers are initialised here.
+  required init(device: MTLDevice, param: ConvAddPreluParam<P>) {
+    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
+    param.filter.initBuffer(device: device, precision: computePrecision)
+    param.y.initBuffer(device: device, precision: computePrecision)
+    param.alpha.initBuffer(device: device, precision: computePrecision)
+
+    // Branch order matters: 1x1 is tested before the depthwise check (filter.channel == 1),
+    // which in turn is tested before the explicit 3x3 / 5x1 / 1x5 sizes.
+    let shapePrefix: String
+    if param.filter.width == 1 && param.filter.height == 1 {
+      shapePrefix = "conv_add_1x1"
+    } else if param.filter.channel == 1 {
+      shapePrefix = "depthwise_conv_add_3x3"
+    } else if param.filter.width == 3 && param.filter.height == 3 {
+      shapePrefix = "conv_add_3x3"
+    } else if param.filter.width == 1 && param.filter.height == 5 {
+      shapePrefix = "conv_add_5x1"
+    } else if param.filter.width == 5 && param.filter.height == 1 {
+      shapePrefix = "conv_add_1x5"
+    } else {
+      fatalError(" unsupport yet ")
+    }
+
+    // Any mode other than "channel" / "element" falls back to the "other" shader variant.
+    let modeSuffix: String
+    if param.mode == "channel" {
+      modeSuffix = "channel"
+    } else if param.mode == "element" {
+      modeSuffix = "element"
+    } else {
+      modeSuffix = "other"
+    }
+
+    let precisionSuffix: String
+    if computePrecision == .Float16 {
+      precisionSuffix = "half"
+    } else if computePrecision == .Float32 {
+      precisionSuffix = "float"
+    } else {
+      fatalError(" unsupport precision ")
+    }
+
+    super.init(device: device, inFunctionName: "\(shapePrefix)_prelu_\(modeSuffix)_\(precisionSuffix)")
+
+    // Offset of the first sampled input position: half of the dilated kernel extent minus the padding.
+    let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
+    let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
+
+    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+  }
+
+  /// Encodes one dispatch of the selected shader into `commandBuffer`.
+  ///
+  /// Bindings follow the shader signature: texture 0 = input, texture 1 = output,
+  /// buffer 0 = `MetalConvParam`, buffer 1 = filter weights, buffer 2 = bias (`param.y`),
+  /// buffer 3 = PReLU alpha.
+  ///
+  /// - Parameters:
+  ///   - commandBuffer: Command buffer to encode into.
+  ///   - param: Operator parameters supplying the textures and buffers.
+  /// - Throws: `PaddleMobileError.predictError` if no compute encoder can be created or
+  ///   `metalParam` has not been set.
+  func compute(commandBuffer: MTLCommandBuffer, param: ConvAddPreluParam<P>) throws {
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+      throw PaddleMobileError.predictError(message: " encode is nil")
+    }
+    // Copy into a non-optional local so `setBytes` receives a pointer to the struct itself.
+    guard var convParam = metalParam else {
+      encoder.endEncoding()
+      throw PaddleMobileError.predictError(message: " metal param is nil")
+    }
+
+    encoder.setTexture(param.input.metalTexture, index: 0)
+    encoder.setTexture(param.output.metalTexture, index: 1)
+    encoder.setBytes(&convParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+    encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
+    encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3)
+    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+    encoder.endEncoding()
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/MulticlassNMSKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/MulticlassNMSKernel.swift
@@ -15,11 +15,41 @@
 import Foundation

 class MulticlassNMSKernel<P: PrecisionType>: Kernel, Computable{
-  
+  let pipline1: MTLComputePipelineState
+
  required init(device: MTLDevice, param: MulticlassNMSParam<P>) {
-    super.init(device: device, inFunctionName: "place_holder")
+    
+    param.middleOutput.initBuffer(device: device)
+    param.bboxOutput.initBuffer(device: device)
+    if computePrecision == .Float32 {
+      pipline1 = device.pipeLine(funcName: "nms_fetch_bbox", inPaddleMobileLib: true)
+      super.init(device: device, inFunctionName: "nms_fetch_result")
+    } else if computePrecision == .Float16 {
+      pipline1 = device.pipeLine(funcName: "nms_fetch_bbox_half", inPaddleMobileLib: true)
+      super.init(device: device, inFunctionName: "nms_fetch_result_half")
+    } else {
+      fatalError( " unsupport precision " )
+    }
+    
  }
  
  func compute(commandBuffer: MTLCommandBuffer, param: MulticlassNMSParam<P>) throws {
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+      throw PaddleMobileError.predictError(message: " encode is nil")
+    }
+    
+    encoder.setTexture(param.scores.metalTexture, index: 0)
+    encoder.setBuffer(param.middleOutput.resultBuffer!, offset: 0, index: 0)
+    encoder.dispatch(computePipline: pipline, outTexture: param.scores.metalTexture)
+    encoder.endEncoding()
+    
+    guard let encoderBox = commandBuffer.makeComputeCommandEncoder() else {
+      throw PaddleMobileError.predictError(message: " encode is nil")
+    }
+    
+    encoderBox.setTexture(param.bboxes.metalTexture, index: 0)
+    encoderBox.setBuffer(param.bboxOutput.resultBuffer!, offset: 0, index: 0)
+    encoderBox.dispatch(computePipline: pipline1, outTexture: param.bboxes.metalTexture)
+    encoderBox.endEncoding()
  }
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift
@@ -71,7 +71,6 @@ class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
  }
  
  func compute(commandBuffer: MTLCommandBuffer, param: ReshapeParam<P>) throws {
-    print("reshape compute")
    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
      throw PaddleMobileError.predictError(message: " encoder is nil")
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ShapeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ShapeKernel.swift
@@ -19,7 +19,7 @@ struct ShapeMetalParam {

 class ShapeKernel<P: PrecisionType>: Kernel, Computable{
  func compute(commandBuffer: MTLCommandBuffer, param: ShapeParam<P>) throws {
-    print("shape compute")
+//    print("shape compute")
 //    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
 //      throw PaddleMobileError.predictError(message: " encode is nil")
 //    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift
@@ -74,4 +74,6 @@ class TransposeKernel<P: PrecisionType>: Kernel, Computable {
    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
    encoder.endEncoding()
  }
+  
+
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal
@@ -17,14 +17,15 @@
 using namespace metal;


-kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                              texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                              constant MetalConvParam &param [[buffer(0)]],
-                                              const device half4 *weights [[buffer(1)]],
-                                              const device half4 *biase [[buffer(2)]],
-                                              const device float4 *new_scale [[buffer(3)]],
-                                              const device float4 *new_biase [[buffer(4)]],
-                                              uint3 gid [[thread_position_in_grid]]) {
+kernel void conv_add_batch_norm_relu_1x1_half(
+            texture2d_array<half, access::sample> inTexture [[texture(0)]],
+            texture2d_array<half, access::write> outTexture [[texture(1)]],
+            constant MetalConvParam &param [[buffer(0)]],
+            const device half4 *weights [[buffer(1)]],
+            const device half4 *biase [[buffer(2)]],
+            const device half4 *new_scale [[buffer(3)]],
+            const device half4 *new_biase [[buffer(4)]],
+            uint3 gid [[thread_position_in_grid]]) {
  
  if (gid.x >= outTexture.get_width() ||
      gid.y >= outTexture.get_height() ||
@@ -41,7 +42,7 @@ kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::samp
  uint input_arr_size = inTexture.get_array_size();
  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
  
-  half4 output = half4(0.0);
+  float4 output = float4(0.0);
  
  half4 input;
  for (uint i = 0; i < input_arr_size; ++i) {
@@ -58,19 +59,19 @@ kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::samp
    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
    output.w += dot(input, weight_w);
  }
-  
-  output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
-  outTexture.write(output, gid.xy, gid.z);
+  output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+  outTexture.write(half4(output), gid.xy, gid.z);
 }

-kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                              texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                              constant MetalConvParam &param [[buffer(0)]],
-                                              const device half4 *weights [[buffer(1)]],
-                                              const device half4 *biase [[buffer(2)]],
-                                              const device float4 *new_scale [[buffer(3)]],
-                                              const device float4 *new_biase [[buffer(4)]],
-                                              uint3 gid [[thread_position_in_grid]]) {
+kernel void conv_add_batch_norm_relu_3x3_half(
+            texture2d_array<half, access::sample> inTexture [[texture(0)]],
+            texture2d_array<half, access::write> outTexture [[texture(1)]],
+            constant MetalConvParam &param [[buffer(0)]],
+            const device half4 *weights [[buffer(1)]],
+            const device half4 *biase [[buffer(2)]],
+            const device half4 *new_scale [[buffer(3)]],
+            const device half4 *new_biase [[buffer(4)]],
+            uint3 gid [[thread_position_in_grid]]) {
  
  if (gid.x >= outTexture.get_width() ||
      gid.y >= outTexture.get_height() ||
@@ -86,7 +87,7 @@ kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::samp
  uint input_arr_size = inTexture.get_array_size();
  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
  
-  half4 output = half4(0.0);
+  float4 output = float4(0.0);
  
  half4 input[9];
  for (uint i = 0; i < input_arr_size; ++i) {
@@ -113,19 +114,19 @@ kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::samp
      output.w += dot(input[j], weight_w);
    }
  }
-  output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
-  outTexture.write(output, gid.xy, gid.z);
+  output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+  outTexture.write(half4(output), gid.xy, gid.z);
 }

-
-kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                                        texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                                        constant MetalConvParam &param [[buffer(0)]],
-                                                        const device half *weights [[buffer(1)]],
-                                                        const device half4 *biase [[buffer(2)]],
-                                                        const device float4 *new_scale [[buffer(3)]],
-                                                        const device float4 *new_biase [[buffer(4)]],
-                                                        uint3 gid [[thread_position_in_grid]]) {
+kernel void depthwise_conv_add_batch_norm_relu_3x3_half(
+            texture2d_array<half, access::sample> inTexture [[texture(0)]],
+            texture2d_array<half, access::write> outTexture [[texture(1)]],
+            constant MetalConvParam &param [[buffer(0)]],
+            const device half *weights [[buffer(1)]],
+            const device half4 *biase [[buffer(2)]],
+            const device half4 *new_scale [[buffer(3)]],
+            const device half4 *new_biase [[buffer(4)]],
+            uint3 gid [[thread_position_in_grid]]) {
  
  if (gid.x >= outTexture.get_width() ||
      gid.y >= outTexture.get_height() ||
@@ -138,7 +139,7 @@ kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, ac
  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
  const uint kernelHXW = 9;
  uint weithTo = gid.z * kernelHXW * 4;
-  half4 output = half4(0.0);
+  float4 output = float4(0.0);
  half4 inputs[9];
  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
@@ -156,11 +157,12 @@ kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, ac
    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
  }
-  output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
-  outTexture.write(output, gid.xy, gid.z);
+  output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+  outTexture.write(half4(output), gid.xy, gid.z);
 }


+
 /*---------------------------------------------*/



--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPrelu.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPrelu.inc.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#ifdef P
+
+#include "Macro.metal"
+
+
+#pragma mark - convAdd
+// 1x1 convolution + bias add + PReLU. One thread handles one output texel
+// (gid.xy) of one output array slice (gid.z), i.e. four output channels.
+//
+// Bindings: texture 0 = input, texture 1 = output, buffer 0 = MetalConvParam,
+// buffer 1 = weights, buffer 2 = bias, buffer 3 = PReLU alpha. The type of
+// alpha depends on which of PRELU_CHANNEL / PRELU_ELEMENT / PRELU_OTHER is defined.
+// NOTE(review): FUNC3_, VECTOR, PRELU_TYPE and P are defined outside this file
+// section (presumably in Macro.metal and by the including file) — confirm.
+kernel void FUNC3_(conv_add_1x1, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<P, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device VECTOR(P, 4) *weights [[buffer(1)]],
+                         const device VECTOR(P, 4) *biase [[buffer(2)]],
+#ifdef PRELU_CHANNEL
+                         const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_ELEMENT
+                         const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_OTHER
+                         const device P *alpha [[buffer(3)]],
+#endif
+                         uint3 gid [[thread_position_in_grid]]) {
+  
+  // Threads outside the output texture have nothing to write.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  
+  // Input position sampled for this output texel: gid.xy * stride + offset.
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  
+  // Pixel coordinates, nearest filtering; clamp_to_zero addressing for out-of-range reads.
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 1;
+  
+  uint input_arr_size = inTexture.get_array_size();
+  // Start of this output slice's weights: four consecutive runs (x, y, z, w),
+  // each kernelHXW * input_arr_size vectors long.
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+  
+  // Accumulate in float regardless of P.
+  float4 output = float4(0.0);
+  
+  VECTOR(P, 4) input;
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input = inTexture.sample(sample,float2(posInInput.x, posInInput.y), i);
+    VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+    output.x += dot(input, weight_x);
+    
+    VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+    output.y += dot(input, weight_y);
+    
+    VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+    output.z += dot(input, weight_z);
+    
+    VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+    output.w += dot(input, weight_w);
+  }
+  
+  // Bias is indexed by output slice.
+  output = output + float4(biase[gid.z]);
+  
+  // PReLU: positive values pass through, non-positive values are scaled by alpha.
+#ifdef PRELU_CHANNEL
+  // One alpha vector per output slice.
+  VECTOR(P, 4) alpha_value = alpha[gid.z];
+  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+#endif
+#ifdef PRELU_ELEMENT
+  // One alpha vector per output texel and slice: (y * width + x) * array_size + z.
+  int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
+  VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
+  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+#endif
+#ifdef PRELU_OTHER
+  // A single scalar alpha shared by every element.
+  P alpha_value = alpha[0];
+  output.x = output.x > 0 ? output.x : (alpha_value * output.x);
+  output.y = output.y > 0 ? output.y : (alpha_value * output.y);
+  output.z = output.z > 0 ? output.z : (alpha_value * output.z);
+  output.w = output.w > 0 ? output.w : (alpha_value * output.w);
+#endif
+  // Convert the float accumulator back to the texture's element type.
+  outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
+}
+
+// 3x3 (optionally dilated) convolution + bias add + PReLU. One thread handles
+// one output texel (gid.xy) of one output array slice (gid.z), i.e. four
+// output channels.
+//
+// Bindings: texture 0 = input, texture 1 = output, buffer 0 = MetalConvParam,
+// buffer 1 = weights, buffer 2 = bias, buffer 3 = PReLU alpha. The type of
+// alpha depends on which of PRELU_CHANNEL / PRELU_ELEMENT / PRELU_OTHER is defined.
+kernel void FUNC3_(conv_add_3x3, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
+    texture2d_array<P, access::write> outTexture [[texture(1)]],
+    constant MetalConvParam &param [[buffer(0)]],
+    const device VECTOR(P, 4) *weights [[buffer(1)]],
+    const device VECTOR(P, 4) *biase [[buffer(2)]],
+#ifdef PRELU_CHANNEL
+     const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_ELEMENT
+     const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_OTHER
+     const device P *alpha [[buffer(3)]],
+#endif
+     uint3 gid [[thread_position_in_grid]]) {
+  // Threads outside the output texture have nothing to write.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  // Centre of the 3x3 window in the input: gid.xy * stride + offset.
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  // Pixel coordinates, nearest filtering; clamp_to_zero addressing for out-of-range taps.
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  const uint kernelHXW = 9;
+
+  uint input_arr_size = inTexture.get_array_size();
+
+  // Start of this output slice's weights: four consecutive runs (x, y, z, w),
+  // each kernelHXW * input_arr_size vectors long.
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+  // Accumulate in float regardless of P.
+  float4 output = float4(0.0);
+
+  ushort dilation_x = param.dilationX;
+  ushort dilation_y = param.dilationY;
+
+  // The nine taps of the window, stored row by row from top-left to bottom-right.
+  VECTOR(P, 4) input[9];
+
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
+
+    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
+
+    input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
+
+    input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
+
+    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+
+    input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
+
+    input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
+
+    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
+
+    input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
+
+    // Within each run the weight for tap j and input slice i sits at j * input_arr_size + i.
+    for (int j = 0; j < 9; ++j) {
+      VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.x += dot(input[j], weight_x);
+
+      VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.y += dot(input[j], weight_y);
+
+      VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.z += dot(input[j], weight_z);
+
+      VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.w += dot(input[j], weight_w);
+    }
+  }
+  // Bias is indexed by output slice.
+  output = output + float4(biase[gid.z]);
+  
+  // PReLU: positive values pass through, non-positive values are scaled by alpha.
+#ifdef PRELU_CHANNEL
+  // One alpha vector per output slice.
+  VECTOR(P, 4) alpha_value = alpha[gid.z];
+  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+#endif
+#ifdef PRELU_ELEMENT
+  // One alpha vector per output texel and slice: (y * width + x) * array_size + z.
+  int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
+  VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
+  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+#endif
+#ifdef PRELU_OTHER
+  // A single scalar alpha shared by every element.
+  P alpha_value = alpha[0];
+  output.x = output.x > 0 ? output.x : (alpha_value * output.x);
+  output.y = output.y > 0 ? output.y : (alpha_value * output.y);
+  output.z = output.z > 0 ? output.z : (alpha_value * output.z);
+  output.w = output.w > 0 ? output.w : (alpha_value * output.w);
+#endif
+  // Convert the float accumulator back to the texture's element type.
+  outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
+}
+
+// 5x1 (five taps along y, optionally dilated) convolution + bias add + PReLU.
+// One thread handles one output texel (gid.xy) of one output array slice
+// (gid.z), i.e. four output channels.
+//
+// Bindings: texture 0 = input, texture 1 = output, buffer 0 = MetalConvParam,
+// buffer 1 = weights, buffer 2 = bias, buffer 3 = PReLU alpha. The type of
+// alpha depends on which of PRELU_CHANNEL / PRELU_ELEMENT / PRELU_OTHER is defined.
+kernel void FUNC3_(conv_add_5x1, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<P, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device VECTOR(P, 4) *weights [[buffer(1)]],
+                         const device VECTOR(P, 4) *biase [[buffer(2)]],
+#ifdef PRELU_CHANNEL
+                        const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_ELEMENT
+                        const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_OTHER
+                        const device P *alpha [[buffer(3)]],
+#endif
+                         uint3 gid [[thread_position_in_grid]]) {
+
+  // Threads outside the output texture have nothing to write.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  // Centre tap position in the input: gid.xy * stride + offset.
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  // Pixel coordinates, nearest filtering; clamp_to_zero addressing for out-of-range taps.
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  const uint kernelHXW = 5;
+
+  uint input_arr_size = inTexture.get_array_size();
+
+  // Start of this output slice's weights: four consecutive runs (x, y, z, w),
+  // each kernelHXW * input_arr_size vectors long.
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+  // The float accumulator is seeded with the bias for this output slice.
+  float4 output = float4(biase[gid.z]);
+
+  ushort dilation_y = param.dilationY;
+  // Five taps along y at offsets -2, -1, 0, +1, +2 times the dilation; x is fixed.
+  VECTOR(P, 4) input[5];
+
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
+
+    input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+
+    input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+
+    input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+
+    input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
+
+    // Within each run the weight for tap j and input slice i sits at j * input_arr_size + i.
+    for (int j = 0; j < 5; ++j) {
+      VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.x += dot(input[j], weight_x);
+
+      VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.y += dot(input[j], weight_y);
+
+      VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.z += dot(input[j], weight_z);
+
+      VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.w += dot(input[j], weight_w);
+    }
+  }
+  
+  // PReLU: positive values pass through, non-positive values are scaled by alpha.
+#ifdef PRELU_CHANNEL
+  // One alpha vector per output slice.
+  VECTOR(P, 4) alpha_value = alpha[gid.z];
+  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+#endif
+#ifdef PRELU_ELEMENT
+  // One alpha vector per output texel and slice: (y * width + x) * array_size + z.
+  int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
+  VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
+  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+#endif
+#ifdef PRELU_OTHER
+  // A single scalar alpha shared by every element.
+  P alpha_value = alpha[0];
+  output.x = output.x > 0 ? output.x : (alpha_value * output.x);
+  output.y = output.y > 0 ? output.y : (alpha_value * output.y);
+  output.z = output.z > 0 ? output.z : (alpha_value * output.z);
+  output.w = output.w > 0 ? output.w : (alpha_value * output.w);
+#endif
+  // Convert the float accumulator back to the texture's element type.
+  outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
+}
+
+
+// 1x5 convolution fused with a bias add and a PReLU activation.
+//
+// Each thread produces one output texel (four channels) at gid.xy in array
+// slice gid.z. Five taps are sampled along x, spaced by param.dilationX and
+// centred on gid.xy * stride + offset; taps that fall outside the texture are
+// handled by the sampler's address::clamp_to_zero mode.
+//
+// Weight layout, in VECTOR(P, 4) units, as indexed below:
+//   ((gid.z * 4 + lane) * 5 + tap) * inputSlices + inputSlice
+// where lane selects the x/y/z/w component of the output texel.
+//
+// Exactly one of PRELU_CHANNEL / PRELU_ELEMENT / PRELU_OTHER is expected to be
+// defined by the including file; it selects the type and indexing of `alpha`.
+kernel void FUNC3_(conv_add_1x5, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<P, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device VECTOR(P, 4) *weights [[buffer(1)]],
+                         const device VECTOR(P, 4) *biase [[buffer(2)]],
+#ifdef PRELU_CHANNEL
+                         const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_ELEMENT
+                         const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_OTHER
+                         const device P *alpha [[buffer(3)]],
+#endif
+                         uint3 gid [[thread_position_in_grid]]) {
+
+  // Threads dispatched beyond the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+
+  const ushort2 strides = ushort2(param.strideX, param.strideY);
+  const ushort2 center = ushort2(gid.xy) * strides + ushort2(param.offsetX, param.offsetY);
+
+  constexpr sampler texelSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  const uint tapCount = 5;
+  const uint sliceCount = inTexture.get_array_size();
+  // Number of weight vectors belonging to one output lane.
+  const uint laneStride = tapCount * sliceCount;
+  // Index of the first weight vector used by this output slice.
+  const uint weightBase = gid.z * laneStride * 4;
+
+  // The accumulator is float even when P is half.
+  float4 acc = float4(biase[gid.z]);
+
+  const ushort dilation = param.dilationX;
+  VECTOR(P, 4) taps[5];
+
+  for (uint slice = 0; slice < sliceCount; ++slice) {
+    // Taps sit at x offsets -2, -1, 0, +1, +2, each scaled by the dilation.
+    for (int t = 0; t < 5; ++t) {
+      taps[t] = inTexture.sample(texelSampler, float2(center.x + (t - 2) * dilation, center.y), slice);
+    }
+
+    for (uint t = 0; t < tapCount; ++t) {
+      const uint tapIndex = weightBase + t * sliceCount + slice;
+      for (uint lane = 0; lane < 4; ++lane) {
+        const VECTOR(P, 4) laneWeight = weights[tapIndex + lane * laneStride];
+        acc[lane] += dot(taps[t], laneWeight);
+      }
+    }
+  }
+  
+#ifdef PRELU_CHANNEL
+  // One slope vector per output slice.
+  const VECTOR(P, 4) slope = alpha[gid.z];
+  for (uint lane = 0; lane < 4; ++lane) {
+    if (!(acc[lane] > 0)) {
+      acc[lane] = slope[lane] * acc[lane];
+    }
+  }
+#endif
+#ifdef PRELU_ELEMENT
+  // One slope vector per output texel, indexed as (y * width + x) * slices + slice.
+  const uint slopeIndex = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size() + gid.z;
+  const VECTOR(P, 4) slope = alpha[slopeIndex];
+  for (uint lane = 0; lane < 4; ++lane) {
+    if (!(acc[lane] > 0)) {
+      acc[lane] = slope[lane] * acc[lane];
+    }
+  }
+#endif
+#ifdef PRELU_OTHER
+  // A single scalar slope shared by every element.
+  const P slope = alpha[0];
+  for (uint lane = 0; lane < 4; ++lane) {
+    if (!(acc[lane] > 0)) {
+      acc[lane] = slope * acc[lane];
+    }
+  }
+#endif
+  outTexture.write(VECTOR(P, 4)(acc), gid.xy, gid.z);
+}
+
+// Depthwise 3x3 convolution fused with a bias add and a PReLU activation.
+//
+// Each thread produces one output texel at gid.xy in array slice gid.z. Every
+// channel of input slice gid.z is filtered with its own 3x3 kernel; the
+// neighbours are always one pixel apart (no dilation parameter is read here).
+//
+// Weight layout, in scalar P units, as indexed below:
+//   (gid.z * 4 + lane) * 9 + tap,   tap = row * 3 + column
+//
+// Exactly one of PRELU_CHANNEL / PRELU_ELEMENT / PRELU_OTHER is expected to be
+// defined by the including file; it selects the type and indexing of `alpha`.
+kernel void FUNC3_(depthwise_conv_add_3x3, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
+    texture2d_array<P, access::write> outTexture [[texture(1)]],
+    constant MetalConvParam &param [[buffer(0)]],
+    const device P *weights [[buffer(1)]],
+    const device VECTOR(P, 4) *biase [[buffer(2)]],
+#ifdef PRELU_CHANNEL
+    const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_ELEMENT
+    const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_OTHER
+    const device P *alpha [[buffer(3)]],
+#endif
+    uint3 gid [[thread_position_in_grid]]) {
+
+  // Threads dispatched beyond the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+
+  // Depthwise: the input slice read is the same as the output slice written.
+  const uint slice = gid.z;
+  const ushort2 strides = ushort2(param.strideX, param.strideY);
+  const ushort2 center = ushort2(gid.xy) * strides + ushort2(param.offsetX, param.offsetY);
+  constexpr sampler texelSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  const uint tapCount = 9;
+  const uint weightBase = gid.z * tapCount * 4;
+
+  // The accumulator is float even when P is half.
+  float4 acc = float4(biase[gid.z]);
+
+  // Gather the 3x3 neighbourhood row by row (top-left first).
+  VECTOR(P, 4) taps[9];
+  for (int dy = -1; dy <= 1; ++dy) {
+    for (int dx = -1; dx <= 1; ++dx) {
+      taps[(dy + 1) * 3 + (dx + 1)] = inTexture.sample(texelSampler, float2(center.x + dx, center.y + dy), slice);
+    }
+  }
+
+  for (uint t = 0; t < tapCount; ++t) {
+    const VECTOR(P, 4) tap = taps[t];
+    for (uint lane = 0; lane < 4; ++lane) {
+      // The product is formed in P precision and then added to the float sum.
+      acc[lane] += tap[lane] * weights[weightBase + lane * tapCount + t];
+    }
+  }
+  
+#ifdef PRELU_CHANNEL
+  // One slope vector per output slice.
+  const VECTOR(P, 4) slope = alpha[gid.z];
+  for (uint lane = 0; lane < 4; ++lane) {
+    if (!(acc[lane] > 0)) {
+      acc[lane] = slope[lane] * acc[lane];
+    }
+  }
+#endif
+#ifdef PRELU_ELEMENT
+  // One slope vector per output texel, indexed as (y * width + x) * slices + slice.
+  const uint slopeIndex = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size() + gid.z;
+  const VECTOR(P, 4) slope = alpha[slopeIndex];
+  for (uint lane = 0; lane < 4; ++lane) {
+    if (!(acc[lane] > 0)) {
+      acc[lane] = slope[lane] * acc[lane];
+    }
+  }
+#endif
+#ifdef PRELU_OTHER
+  // A single scalar slope shared by every element.
+  const P slope = alpha[0];
+  for (uint lane = 0; lane < 4; ++lane) {
+    if (!(acc[lane] > 0)) {
+      acc[lane] = slope * acc[lane];
+    }
+  }
+#endif
+  outTexture.write(VECTOR(P, 4)(acc), gid.xy, gid.z);
+}
+
+#endif
+
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPreluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPreluKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+
+// Instantiates the kernels in ConvAddPrelu.inc.metal once per combination of
+// precision (P = float, half) and PReLU mode (channel, element, other).
+// For each include, P and PRELU_TYPE are defined together with exactly one of
+// PRELU_CHANNEL / PRELU_ELEMENT / PRELU_OTHER, and all but P are undefined
+// again afterwards so the next include starts clean.
+// NOTE(review): the included file presumably tests the PRELU_* macros with
+// #ifdef only and builds kernel names with FUNC3_(name, PRELU_TYPE, P),
+// giving names of the form <name>_<prelu_type>_<P> - confirm against
+// ConvAddPrelu.inc.metal.
+#define P float
+
+  // float, per-channel slope
+  #define PRELU_CHANNEL prelu_channel
+  #define PRELU_TYPE prelu_channel
+    #include "ConvAddPrelu.inc.metal"
+  #undef  PRELU_TYPE
+  #undef  PRELU_CHANNEL
+
+  // float, per-element slope
+  #define PRELU_ELEMENT prelu_element
+  #define PRELU_TYPE prelu_element
+    #include "ConvAddPrelu.inc.metal"
+  #undef  PRELU_TYPE
+  #undef  PRELU_ELEMENT
+
+  // float, "other" slope mode
+  #define PRELU_OTHER   prelu_other
+  #define PRELU_TYPE prelu_other
+    #include "ConvAddPrelu.inc.metal"
+  #undef  PRELU_TYPE
+  #undef  PRELU_OTHER
+
+#undef P
+
+// Same three PReLU modes again, this time with half precision.
+#define P half
+
+  // half, per-channel slope
+  #define PRELU_CHANNEL prelu_channel
+  #define PRELU_TYPE prelu_channel
+    #include "ConvAddPrelu.inc.metal"
+  #undef  PRELU_TYPE
+  #undef  PRELU_CHANNEL
+
+  // half, per-element slope
+  #define PRELU_ELEMENT prelu_element
+  #define PRELU_TYPE prelu_element
+    #include "ConvAddPrelu.inc.metal"
+  #undef  PRELU_TYPE
+  #undef  PRELU_ELEMENT
+
+  // half, "other" slope mode
+  #define PRELU_OTHER   prelu_other
+  #define PRELU_TYPE prelu_other
+    #include "ConvAddPrelu.inc.metal"
+  #undef  PRELU_TYPE
+  #undef  PRELU_OTHER
+
+#undef P
+
+
+
+
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/FetchKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/FetchKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+// Copies a float texture array into a linear float buffer in planar order.
+// The value for slice z, component c (0..3 = x, y, z, w), row y, column x is
+// written to output[((z * 4 + c) * height + y) * width + x].
+// One thread handles one texel, i.e. four output values.
+kernel void fetch(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                       device float *output [[buffer(0)]],
+                      uint3 gid [[thread_position_in_grid]]) {
+  
+  // Threads dispatched beyond the texture extent have nothing to copy.
+  const bool insideTexture = gid.x < inTexture.get_width() &&
+                             gid.y < inTexture.get_height() &&
+                             gid.z < inTexture.get_array_size();
+  if (!insideTexture) {
+    return;
+  }
+  
+  const uint width = inTexture.get_width();
+  // Number of values in one channel plane.
+  const uint planeSize = width * inTexture.get_height();
+  const float4 texel = inTexture.read(gid.xy, gid.z);
+  // Position of this texel's x component; the other components follow one plane apart.
+  const uint firstSlot = gid.z * 4 * planeSize + gid.y * width + gid.x;
+  for (uint component = 0; component < 4; ++component) {
+    output[firstSlot + component * planeSize] = texel[component];
+  }
+}
+
+
+// Half-precision variant of the planar fetch: reads a half texture array and
+// writes the values, widened to float, to a linear float buffer.
+// The value for slice z, component c (0..3 = x, y, z, w), row y, column x is
+// written to output[((z * 4 + c) * height + y) * width + x].
+kernel void fetch_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                  device float * output [[buffer(0)]],
+                  uint3 gid [[thread_position_in_grid]]) {
+  
+  // Threads dispatched beyond the texture extent have nothing to copy.
+  const bool insideTexture = gid.x < inTexture.get_width() &&
+                             gid.y < inTexture.get_height() &&
+                             gid.z < inTexture.get_array_size();
+  if (!insideTexture) {
+    return;
+  }
+  
+  const uint width = inTexture.get_width();
+  // Number of values in one channel plane.
+  const uint planeSize = width * inTexture.get_height();
+  const half4 texel = inTexture.read(gid.xy, gid.z);
+  // Position of this texel's x component; the other components follow one plane apart.
+  const uint firstSlot = gid.z * 4 * planeSize + gid.y * width + gid.x;
+  for (uint component = 0; component < 4; ++component) {
+    output[firstSlot + component * planeSize] = texel[component];
+  }
+  
+}
+
+// No-op kernel with the same argument list as `fetch`: it neither reads
+// inTexture nor writes output.
+// NOTE(review): presumably bound when a fetch pipeline is required but the
+// result is copied out by other means - confirm against the Swift caller.
+kernel void fetch_placeholder(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                              device float *output [[buffer(0)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+  
+}
+
+// No-op kernel with the same argument list as `fetch_half`: it neither reads
+// inTexture nor writes output.
+// NOTE(review): presumably the half-precision counterpart of
+// fetch_placeholder - confirm against the Swift caller.
+kernel void fetch_placeholder_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                              device float *output [[buffer(0)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+}
+
+
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Macro.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Macro.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+
+// Token-pasting primitives. CONCAT2 joins two tokens directly; the CONCATn_
+// variants join n tokens with an underscore between each pair.
+// Arguments used with ## are not macro-expanded, so these are meant to be
+// reached through the wrappers below rather than called with macro arguments.
+#define CONCAT2(a, b) a ## b
+#define CONCAT2_(a, b) a ## _ ## b
+#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
+#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d
+#define CONCAT5_(a, b, c, d, e) a ## _ ## b ## _ ## c ## _ ## d ## _ ## e
+
+// Wrappers that expand their arguments first and then paste them.
+// FUNC builds a five-part underscore-separated identifier.
+#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p)
+// VECTOR(p, n) builds a vector type name, e.g. VECTOR(P, 4) is half4 when P is half.
+#define VECTOR(p, n) CONCAT2(p, n)
+
+// FUNC3_ builds a three-part underscore-separated identifier: a_b_c.
+#define FUNC3_(a, b, c) CONCAT3_(a, b, c)
+
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/NMSFetchResultKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/NMSFetchResultKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+// Copies the x component of every texel into a row-major float buffer:
+// output[y * width + x] = texel(x, y, slice).x.
+// NOTE(review): the destination index does not include gid.z, so texels from
+// different slices target the same element; this presumably relies on the
+// texture having a single slice - confirm against the caller.
+kernel void nms_fetch_result(texture2d_array<float, access::read> inTexture [[texture(0)]],
+    device float *output [[buffer(0)]],
+    uint3 gid [[thread_position_in_grid]]) {
+  
+  // Threads dispatched beyond the texture extent have nothing to copy.
+  const bool insideTexture = gid.x < inTexture.get_width() &&
+                             gid.y < inTexture.get_height() &&
+                             gid.z < inTexture.get_array_size();
+  if (!insideTexture) {
+    return;
+  }
+  
+  const uint rowLength = inTexture.get_width();
+  const uint slot = gid.y * rowLength + gid.x;
+  const float4 texel = inTexture.read(gid.xy, gid.z);
+  output[slot] = texel.x;
+  
+}
+
+
+// Half-precision variant: copies the x component of every texel, widened to
+// float, into a row-major buffer: output[y * width + x] = texel(x, y, slice).x.
+// NOTE(review): the destination index does not include gid.z, so texels from
+// different slices target the same element; this presumably relies on the
+// texture having a single slice - confirm against the caller.
+kernel void nms_fetch_result_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                             device float *output [[buffer(0)]],
+                             uint3 gid [[thread_position_in_grid]]) {
+  
+  // Threads dispatched beyond the texture extent have nothing to copy.
+  const bool insideTexture = gid.x < inTexture.get_width() &&
+                             gid.y < inTexture.get_height() &&
+                             gid.z < inTexture.get_array_size();
+  if (!insideTexture) {
+    return;
+  }
+  
+  const uint rowLength = inTexture.get_width();
+  const uint slot = gid.y * rowLength + gid.x;
+  const half4 texel = inTexture.read(gid.xy, gid.z);
+  output[slot] = texel.x;
+}
+
+// Copies whole texels into a row-major float4 buffer:
+// output[y * width + x] = texel(x, y, slice).
+// NOTE(review): the destination index does not include gid.z, so texels from
+// different slices target the same element; this presumably relies on the
+// texture having a single slice - confirm against the caller.
+kernel void nms_fetch_bbox(texture2d_array<float, access::read> inTexture [[texture(0)]],
+    device float4 *output [[buffer(0)]],
+    uint3 gid [[thread_position_in_grid]]) {
+  
+  // Threads dispatched beyond the texture extent have nothing to copy.
+  const bool insideTexture = gid.x < inTexture.get_width() &&
+                             gid.y < inTexture.get_height() &&
+                             gid.z < inTexture.get_array_size();
+  if (!insideTexture) {
+    return;
+  }
+  
+  const uint rowLength = inTexture.get_width();
+  const uint slot = gid.y * rowLength + gid.x;
+  const float4 texel = inTexture.read(gid.xy, gid.z);
+  output[slot] = texel;
+}
+
+// Half-precision variant: copies whole texels, widened to float4, into a
+// row-major buffer: output[y * width + x] = float4(texel(x, y, slice)).
+// NOTE(review): the destination index does not include gid.z, so texels from
+// different slices target the same element; this presumably relies on the
+// texture having a single slice - confirm against the caller.
+kernel void nms_fetch_bbox_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                           device float4 *output [[buffer(0)]],
+                           uint3 gid [[thread_position_in_grid]]) {
+  // Threads dispatched beyond the texture extent have nothing to copy.
+  const bool insideTexture = gid.x < inTexture.get_width() &&
+                             gid.y < inTexture.get_height() &&
+                             gid.z < inTexture.get_array_size();
+  if (!insideTexture) {
+    return;
+  }
+  
+  const uint rowLength = inTexture.get_width();
+  const uint slot = gid.y * rowLength + gid.x;
+  const half4 texel = inTexture.read(gid.xy, gid.z);
+  output[slot] = float4(texel);
+}
+
--- a/metal/paddle-mobile/paddle-mobile/Operators/MulticlassNMSOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/MulticlassNMSOp.swift
@@ -21,10 +21,16 @@ class MulticlassNMSParam<P: PrecisionType>: OpParam {
      scores = try MulticlassNMSParam.getFirstTensor(key: "Scores", map: opDesc.inputs, from: inScope)
      bboxes = try MulticlassNMSParam.getFirstTensor(key: "BBoxes", map: opDesc.inputs, from: inScope)
      output = try MulticlassNMSParam.outputOut(outputs: opDesc.outputs, from: inScope)
+      
+      middleOutput = FetchHolder.init(inCapacity: scores.tensorDim.numel(), inDim: scores.tensorDim.dims)
+      
+      bboxOutput = FetchHolder.init(inCapacity: bboxes.tensorDim.numel(), inDim: bboxes.tensorDim.dims)
    } catch let error {
      throw error
    }
  }
+  var bboxOutput: FetchHolder
+  var middleOutput: FetchHolder
  let scores: Texture<P>
  let bboxes: Texture<P>
  var output: Texture<P>
@@ -33,7 +39,15 @@ class MulticlassNMSParam<P: PrecisionType>: OpParam {
 class MulticlassNMSOp<P: PrecisionType>: Operator<MulticlassNMSKernel<P>, MulticlassNMSParam<P>>, Runable, Creator, InferShaperable{

  func inputVariant() -> [String : [Variant]] {
-    return ["Scores" : [para.scores], "BBoxes" : [para.bboxes]]
+    return ["Scores" : [para.middleOutput], "BBoxes" : [para.bboxOutput]]
+  }
+  
+  func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) {
+    do {
+      try kernel.compute(commandBuffer: buffer, param: para)
+    } catch let _ {
+      fatalError()
+    }
  }
  
  func inferShape() {
@@ -42,11 +56,12 @@ class MulticlassNMSOp<P: PrecisionType>: Operator<MulticlassNMSKernel<P>, Multic
  
  typealias OpType =  MulticlassNMSOp<P>
  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
-    }
+
+  }
+  
+  func delogOutput() {
+    print(" nms - output: ")
+    print(para.bboxes.metalTexture.float32Array().strideArray())
  }
 }


--- a/metal/paddle-mobile/paddle-mobile/PaddleMobile.swift
+++ b/metal/paddle-mobile/paddle-mobile/PaddleMobile.swift
@@ -16,31 +16,13 @@ import Foundation

 class ScaleKernel: CusomKernel {
  init(device: MTLDevice, shape: Shape) {
-    super.init(device: device, inFunctionName: "scale", outputDim: shape, usePaddleMobileLib: false)
-  }
-}
-
-public class Net: NSObject {
-  var except: Int = 0
-  var means: [Float] = []
-  var scale: Float = 0.0
-  var dim: (n: Int, h: Int, w: Int, c: Int) = (n: 0, h: 0, w: 0, c: 0)
-  var preprocessKernel: CusomKernel? = nil
-  var paramPointer: UnsafeMutableRawPointer? = nil
-  var paramSize: Int = 0
-  var modelPointer: UnsafeMutableRawPointer? = nil
-  var modelSize: Int = 0
-  var modelPath: String = ""
-  var paramPath: String = ""
-  var modelDir: String = ""
-  public func resultStr(res: [Float]) -> String {
-    fatalError()
-  }
-  func fetchResult(paddleMobileRes: ResultHolder) -> [Float32] {
-    return paddleMobileRes.resultArr
-  }
-  @objc public init(device: MTLDevice) {
-    super.init()
+    if computePrecision == .Float32 {
+      super.init(device: device, inFunctionName: "scale", outputDim: shape, usePaddleMobileLib: false)
+    } else if computePrecision == .Float16 {
+      super.init(device: device, inFunctionName: "scale_half", outputDim: shape, usePaddleMobileLib: false)
+    } else {
+      fatalError(" unsupport ")
+    }
  }
 }

@@ -118,18 +100,18 @@ public class Runner: NSObject {
   * texture: 需要预测的 texture 需要做过预处理
   * ( _ success: Bool, _ time:TimeInterval, _ resultArray: [Float32]) -> Void : 回调闭包, 三个参数分别为: 是否成功, 预测耗时, 结果数组
   */
-  @objc public func predict(texture: MTLTexture, completion: @escaping ( _ success: Bool, _ resultArray: [Float32]) -> Void) {
+  @objc public func predict(texture: MTLTexture, completion: @escaping ( _ success: Bool, _ result: ResultHolder?) -> Void) {
    do {
      try self.executor?.predict(input: texture, dim: [self.net.dim.n, self.net.dim.h, self.net.dim.w, self.net.dim.c], completionHandle: { [weak self] (res) in
        guard let SSelf = self else {
          fatalError( " self nil " )
        }
-        let resultArray = SSelf.net.fetchResult(paddleMobileRes: res)
-        completion(true, resultArray)
+        let result = SSelf.net.fetchResult(paddleMobileRes: res)
+        completion(true, result)
      }, preProcessKernle: self.net.preprocessKernel, except: self.net.except)
    } catch let error {
      print(error)
-      completion(false, [])
+      completion(false, nil)
      return
    }
  }
@@ -139,21 +121,21 @@ public class Runner: NSObject {
   * cgImage: 需要预测的图片
   * ( _ success: Bool, _ time:TimeInterval, _ resultArray: [Float32]) -> Void : 回调闭包, 三个参数分别为: 是否成功, 预测耗时, 结果数组
   */
-  @objc public func predict(cgImage: CGImage, completion: @escaping ( _ success: Bool, _ resultArray: [Float32]) -> Void) {
-    if platform == .GPU {
-      getTexture(image: cgImage) { [weak self] (texture) in
-        guard let SSelf = self else {
-          fatalError( "" )
-        }
-        SSelf.predict(texture: texture, completion: completion)
-      }
-    } else if platform == .CPU {
-      let input = preproccess(image: cgImage)
-      predict(inputPointer: input, completion: completion)
-      input.deinitialize(count: numel)
-      input.deallocate()
-    }
-  }
+//  @objc public func predict(cgImage: CGImage, completion: @escaping ( _ success: Bool, _ resultArray: [Float32]) -> Void) {
+//    if platform == .GPU {
+//      getTexture(image: cgImage) { [weak self] (texture) in
+//        guard let SSelf = self else {
+//          fatalError( "" )
+//        }
+//        SSelf.predict(texture: texture, completion: completion)
+//      }
+//    } else if platform == .CPU {
+//      let input = preproccess(image: cgImage)
+//      predict(inputPointer: input, completion: completion)
+//      input.deinitialize(count: numel)
+//      input.deallocate()
+//    }
+//  }
  
  /*
   * 清理内存, 调用此函数后, 不能再使用, 需重新 load
@@ -184,10 +166,10 @@ public class Runner: NSObject {
   */
  @objc public func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) {
    let texture = try? textureLoader?.newTexture(cgImage: image, options: [:]) ?! " texture loader error"
-    scaleTexture(input: texture!, size: (net.dim.w, net.dim.h), complete: getTexture)
+    scaleTexture(input: texture!, complete: getTexture)
  }
  
-  func scaleTexture(input: MTLTexture, size:(width: Int, height: Int), complete: @escaping (MTLTexture) -> Void) {
+  public func scaleTexture(input: MTLTexture , complete: @escaping (MTLTexture) -> Void) {
    
    guard let inQueue = queue, let inDevice = device else {
      fatalError( " queue or devcie nil " )
@@ -197,7 +179,7 @@ public class Runner: NSObject {
      fatalError( " make buffer error" )
    }
    
-    let scaleKernel = ScaleKernel.init(device: inDevice, shape: CusomKernel.Shape.init(inWidth: size.width, inHeight: size.height, inChannel: 3))
+    let scaleKernel = ScaleKernel.init(device: inDevice, shape: CusomKernel.Shape.init(inWidth: net.dim.w, inHeight: net.dim.h, inChannel: 3))
    
    do {
      try scaleKernel.compute(inputTexuture: input, commandBuffer: buffer)

--- a/metal/paddle-mobile/paddle-mobile/Program/ProgramOptimize.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/ProgramOptimize.swift
@@ -15,209 +15,272 @@
 import Foundation

 precedencegroup ChainNode {
-    associativity: left
-    higherThan: MultiplicationPrecedence
+  associativity: left
+  higherThan: MultiplicationPrecedence
 }

 infix operator --> : ChainNode

 class Node {
-    var inputs: [Node] = []
-    var outputs: [Node] = []
-    var type: String
-    var opDesc: OpDesc?
-    init(inOpDesc: OpDesc) {
-        type = inOpDesc.type
-        opDesc = inOpDesc
+  var inputs: [Node] = []
+  var outputs: [Node] = []
+  var type: String
+  var opDesc: OpDesc?
+  /// Creates a node for `inOpDesc`, keeping the descriptor and taking its
+  /// `type` as the node's type.
+  init(inOpDesc: OpDesc) {
+    self.opDesc = inOpDesc
+    self.type = inOpDesc.type
+  }
+  
+  /// Creates a node that carries only a type name; `opDesc` stays `nil`.
+  init(inType: String) {
+    self.type = inType
+  }
+  
+  /// Returns every node reached after following `outputs` exactly `index`
+  /// times from this node (`0` yields this node itself). A node reachable
+  /// along several paths appears once per path.
+  subscript(index: Int) -> [Node] {
+    var collected = [Node]()
+    getNodesWithLocation(index: index, nowIndex: 0, nodes: &collected)
+    return collected
+  }
+  
+  func getNodesWithLocation(index: Int, nowIndex: Int, nodes: inout [Node]) {
+    if index == nowIndex {
+      nodes.append(self)
    }
    
-    init(inType: String) {
-        type = inType
+    for output in outputs {
+      output.getNodesWithLocation(index: index, nowIndex: nowIndex + 1, nodes: &nodes)
+    }
+  }
+  
+  /// Links `lNode` to `rNode` (as output and input respectively) and returns
+  /// `rNode`, so links can be chained: `a --> b --> c`.
+  static func -->(lNode: Node, rNode: Node) -> Node {
+    rNode.inputs.append(lNode)
+    lNode.outputs.append(rNode)
+    return rNode
+  }
+  
+  /// Returns the level of the deepest node reachable through `outputs`,
+  /// counting this node as level `begin`.
+  ///
+  /// With the default `begin` of 1 this is the number of nodes on the longest
+  /// output path starting here (a node without outputs has depth 1).
+  ///
+  /// - Parameter begin: Level assigned to this node; each step along
+  ///   `outputs` adds one.
+  /// - Returns: The largest level found, never less than 1.
+  func depth(begin: UInt = 1) -> UInt {
+    // Start from max(begin, 1) so a childless node reports the same value as before.
+    var deepest = max(begin, 1)
+    for output in outputs {
+      // Keep the running maximum; assigning the branch depth directly would
+      // let a shallow later branch overwrite a deeper earlier one.
+      deepest = max(deepest, output.depth(begin: begin + 1))
+    }
+    return deepest
+  }
+  
+  /// Builds a type-only copy of the graph below this node, limited to `depth`
+  /// levels (this node counts as the first level). The copies carry `type`
+  /// and `outputs` only.
+  ///
+  /// - Parameter depth: Number of levels to copy; must be at least 1, because
+  ///   `depth - 1` is computed on a `UInt`.
+  /// - Returns: The copy of this node, which is the root of the copied graph.
+  func to(depth: UInt) -> Node {
+    let root = Node(inType: type)
+    let remainingLevels = depth - 1
+    to(depth: remainingLevels, withNode: root)
+    return root
+  }
+  
+  /// Folds the nodes downstream of this one into this node according to
+  /// `fusion`, then renames this node (and its `opDesc`) to the fused type.
+  ///
+  /// This node's outputs are detached first; each former output is then folded
+  /// against the output at the same position in the fusion pattern, so this
+  /// node must not have more outputs than the pattern's root.
+  ///
+  /// - Parameters:
+  ///   - fusion: Supplies the pattern (`fusionNode()`), the parameter-key
+  ///     renames (`change()`) and the resulting type (`fusionType()`).
+  ///   - removedNodes: Receives every node that was folded away.
+  func folderWith(fusion: Fusion.Type, removedNodes: inout [Node]) {
+    let pattern = fusion.fusionNode()
+    let keyChanges = fusion.change()
+    // Detach the successors before folding; the fold re-attaches whatever follows the pattern.
+    let detached = outputs
+    outputs.removeAll()
+    opDesc?.outputs.removeAll()
+    for (position, successor) in detached.enumerated() {
+      successor.folderWith(beginNode: self, matchNode: pattern.outputs[position], change: keyChanges, removedNodes: &removedNodes)
+    }
+    opDesc?.type = fusion.fusionType()
+    type = fusion.fusionType()
+  }
+  
+  private func folderWith(beginNode: Node, matchNode: Node, change: [String : [(from: String, to: String)]], removedNodes: inout [Node]) {
+    guard let inOpdesc = opDesc else {
+      fatalError()
    }
    
-    static func -->(lNode: Node, rNode: Node) -> Node {
-        lNode.outputs.append(rNode)
-        rNode.inputs.append(lNode)
-        return rNode
+    for attr in inOpdesc.attrs {
+      beginNode.opDesc?.attrs[attr.key] = attr.value
+      //            print(beginNode.opDesc?.attrs)
    }
    
-    func depth(begin: UInt = 1) -> UInt {
-        var beginMax: UInt = 1
-        for output in outputs {
-            let subDepth = output.depth(begin: begin + 1)
-            beginMax = max(begin, subDepth)
+    for paraInput in inOpdesc.paraInputs {
+      if let inChanges = change[type] {
+        for keyChange in inChanges {
+          if keyChange.from == paraInput.key {
+            beginNode.opDesc?.paraInputs[keyChange.to] = paraInput.value
+          } else {
+            beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value
+          }
        }
-        beginMax = max(begin, beginMax)
-        return beginMax
+      } else {
+        beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value
+      }
    }
    
-    func to(depth: UInt) -> Node {
-        let beginNode = Node.init(inType: type)
-        to(depth: depth - 1, withNode: beginNode)
-        return beginNode
+    if matchNode.outputs.count == 0 {
+      beginNode.outputs.append(contentsOf: outputs)
+      beginNode.opDesc?.outputs = inOpdesc.outputs
+      
    }
+    removedNodes.append(self)
    
-    func folderWith(fusion: Fusion.Type, removedNodes: inout [Node]) {
-        let fusionNode = fusion.fusionNode()
-        let change = fusion.change()
-        let inOutputs = outputs
-        outputs.removeAll()
-        opDesc?.outputs.removeAll()
-        for i in 0..<inOutputs.count {
-            inOutputs[i].folderWith(beginNode: self, matchNode: fusionNode.outputs[i], change: change, removedNodes: &removedNodes)
-        }
-        opDesc?.type = fusion.fusionType()
-        type = fusion.fusionType()
+    for i in 0..<matchNode.outputs.count {
+      outputs[i].folderWith(beginNode: beginNode, matchNode: matchNode.outputs[i], change: change, removedNodes: &removedNodes)
    }
    
-    private func folderWith(beginNode: Node, matchNode: Node, change: [String : [(from: String, to: String)]], removedNodes: inout [Node]) {
-        guard let inOpdesc = opDesc else {
-            fatalError()
-        }
-        
-        for attr in inOpdesc.attrs {
-            beginNode.opDesc?.attrs[attr.key] = attr.value
-//            print(beginNode.opDesc?.attrs)
-        }
-        
-        for paraInput in inOpdesc.paraInputs {
-            if let inChanges = change[type] {
-                for keyChange in inChanges {
-                    if keyChange.from == paraInput.key {
-                        beginNode.opDesc?.paraInputs[keyChange.to] = paraInput.value
-                    } else {
-                        beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value
-                    }
-                }
-            } else {
-                beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value
-            }
-        }
-        
-        if matchNode.outputs.count == 0 {
-            beginNode.outputs.append(contentsOf: outputs)
-            beginNode.opDesc?.outputs = inOpdesc.outputs
-            
-        }
-        removedNodes.append(self)
-        
-        for i in 0..<matchNode.outputs.count {
-            outputs[i].folderWith(beginNode: beginNode, matchNode: matchNode.outputs[i], change: change, removedNodes: &removedNodes)
-        }
-        
+  }
+  
+  private func to(depth: UInt, withNode: Node) {
+    if depth < 1 {
+      return
    }
    
-    private func to(depth: UInt, withNode: Node) {
-        if depth < 1 {
-            return
-        }
-        
-        for output in outputs {
-            let node = Node.init(inType: output.type)
-            withNode.outputs.append(node)
-            output.to(depth: depth - 1, withNode: node)
-        }
+    for output in outputs {
+      let node = Node.init(inType: output.type)
+      withNode.outputs.append(node)
+      output.to(depth: depth - 1, withNode: node)
+    }
+  }
+  
+  /// Maps every output variable name to the node that produces it, for this
+  /// node and the nodes reachable through `outputs`. When a name is produced
+  /// more than once, the node visited last wins.
+  func relationship() -> [String : Node]{
+    var producers = [String : Node]()
+    relationship(map: &producers)
+    return producers
+  }
+  
+  private func relationship(map: inout [String : Node]) {
+    guard let inOpDesc = opDesc else {
+      return
    }
    
+    for output in inOpDesc.outputs {
+      for outputKey in output.value {
+        map[outputKey] = self
+      }
+    }
    
+    for output in outputs {
+      output.relationship(map: &map)
+    }
+  }
+  
 }

 extension Node: Equatable {
-    static func == (lhs: Node, rhs: Node) -> Bool {
-        if lhs.outputs.count != rhs.outputs.count {
-            return false
-        }
-        
-        if lhs.type != rhs.type {
-            return false
-        }
-        
-        for i in 0..<lhs.outputs.count {
-            if lhs.outputs[i] != rhs.outputs[i] {
-                return false
-            }
-        }
-        return true
+  static func == (lhs: Node, rhs: Node) -> Bool {  // structural equality: same type and pairwise-equal outputs
+    guard lhs.outputs.count == rhs.outputs.count else {
+      return false
    }
    
+    guard lhs.type == rhs.type else {
+      return false
+    }
+    
+    for (left, right) in zip(lhs.outputs, rhs.outputs) {
+      if left != right {
+        return false
+      }
+    }
+    return true
+  }
+  
 }

 class ProgramOptimize<P: PrecisionType> {
-    // register fusion
-    let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp<P>.self,
-                                    ConvAddOp<P>.self,
-                                    ConvBNReluOp<P>.self,
-                                    DwConvBNReluOp<P>.self]
-    
-    func optimize(originProgramDesc: ProgramDesc) -> ProgramDesc {
-        
-        guard originProgramDesc.blocks.count == 1 else {
-            fatalError(" not support yet")
+  // Registered fusion patterns; optimize() tries them in this array order.
+  let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp<P>.self,
+                                  ConvAddPreluOp<P>.self,
+                                  ConvAddOp<P>.self,
+                                  ConvBNReluOp<P>.self,
+                                  DwConvBNReluOp<P>.self
+  ]
+  // Builds a node graph from the single block's ops, folds matching fusion patterns, and returns a new ProgramDesc.
+  func optimize(originProgramDesc: ProgramDesc) -> ProgramDesc {
+    // Only single-block programs are handled; any other block count traps.
+    guard originProgramDesc.blocks.count == 1 else {
+      fatalError(" not support yet")
+    }
+    // mapForNodeChain maps a variable name to the node that most recently listed it as an output.
+    var mapForNodeChain: [String : Node] = [:]
+    var nodes: [Node] = []
+    var typeMapNodes: [String : [(node: Node, output: [String : Node])]] = [:]
+    let block = originProgramDesc.blocks[0]
+    for opDesc in block.ops {
+      guard let opInputKeys = opInfos[opDesc.type]?.inputs, let outputKeys = opInfos[opDesc.type]?.outputs else {
+        fatalError()
+      }
+      // Connect each already-seen producer of this op's inputs to the new node (`-->` is defined elsewhere; presumably it links producer to consumer — confirm).
+      let node = Node.init(inOpDesc: opDesc)
+      for inputKey in opInputKeys {
+        if let inputs = opDesc.inputs[inputKey] {
+          for input in inputs {
+            if let inputNode = mapForNodeChain[input] {
+              _ = inputNode --> node
+            }
+          }
        }
-        
-        var mapForNodeChain: [String : Node] = [:]
-        var nodes: [Node] = []
-        var typeMapNodes: [String : [Node]] = [:]
-        let block = originProgramDesc.blocks[0]
-            for opDesc in block.ops {
-                guard let opInputKeys = opInfos[opDesc.type]?.inputs, let outputKeys = opInfos[opDesc.type]?.outputs else {
-                    fatalError()
-                }
-                
-                let node = Node.init(inOpDesc: opDesc)
-                for inputKey in opInputKeys {
-                    if let inputs = opDesc.inputs[inputKey] {
-                        for input in inputs {
-                            if let inputNode = mapForNodeChain[input] {
-                                _ = inputNode --> node
-                            }
-                        }
-                    }
-                }
-                
-                for outputKey in outputKeys {
-                    if let outputs = opDesc.outputs[outputKey] {
-                        for output in outputs {
-                            mapForNodeChain[output] = node
-                        }
+      }
+      // Register this node as the producer of its outputs, replacing any earlier producer of the same name.
+      for outputKey in outputKeys {
+        if let outputs = opDesc.outputs[outputKey] {
+          for output in outputs {
+            mapForNodeChain[output] = node
+          }
+        }
+      }
+      
+      nodes.append(node)
+      // Group the node by op type, paired with a copy of mapForNodeChain as of this op (Swift dictionaries are value types).
+      if var inNodes = typeMapNodes[opDesc.type] {
+        inNodes.append((node, mapForNodeChain))
+        typeMapNodes[opDesc.type] = inNodes
+      } else {
+        typeMapNodes[opDesc.type] = [(node, mapForNodeChain)]
+      }
+    }
+    // Try each fusion against every node whose type equals the type of the fusion's pattern node.
+    for fusion in fusionOps {
+      let fusionNode = fusion.fusionNode()
+      let depth = fusionNode.depth()
+      if let toMatchNodes = typeMapNodes[fusionNode.type] {
+        for node in toMatchNodes {
+          // Compare the candidate's to(depth:) result with the pattern using Node's ==, which checks type and outputs recursively.
+          let toNode = node.node.to(depth: depth)
+          if toNode == fusionNode {   // match
+            var canFolder = true
+            let relationshipMap = toNode.relationship()
+            // Veto the fusion if a needCheck() input is found neither in the candidate's producer copy nor in relationshipMap.
+            for toCheck in fusion.needCheck() {
+              // toCheck.0 indexes into toNode via Node's subscript; toCheck.1 is the input key whose variable names are examined.
+              let checkNodes = toNode[toCheck.0]
+              
+              for checkNode in checkNodes {
+                let inputToChecks = checkNode.opDesc?.inputs[toCheck.1] ?? []
+                for inputToCheck in inputToChecks {
+                  if node.output[inputToCheck] == nil {
+                    if relationshipMap[inputToCheck] == nil {
+                      canFolder = false
                    }
+                  }
                }
-                
-                nodes.append(node)
-                
-                if var inNodes = typeMapNodes[opDesc.type] {
-                    inNodes.append(node)
-                    typeMapNodes[opDesc.type] = inNodes
-                } else {
-                    typeMapNodes[opDesc.type] = [node]
-                }
+              }
            }
            
-            for fusion in fusionOps {
-                let fusionNode = fusion.fusionNode()
-                let depth = fusionNode.depth()
-                if let toMatchNodes = typeMapNodes[fusionNode.type] {
-                    for node in toMatchNodes {
-                        let toNode = node.to(depth: depth)
-                        if toNode == fusionNode {   // match
-                            var removeNodes: [Node] = []
-                            node.folderWith(fusion: fusion, removedNodes: &removeNodes)
-                            for removeNode in removeNodes {
-                                nodes.remove(element: removeNode)
-                            }
-                        }
-                    }
-                }
+            if !canFolder {
+              continue
            }
-        
-        var ops: [OpDesc] = []
-        for node in nodes {
-            ops.append(node.opDesc!)
+            // Fold the match, then drop every node that folderWith reported in removeNodes from the working list.
+            var removeNodes: [Node] = []
+            node.node.folderWith(fusion: fusion, removedNodes: &removeNodes)
+            for removeNode in removeNodes {
+              nodes.remove(element: removeNode)
+            }
+          }
        }
-        
-        var newProgramDesc = ProgramDesc.init()
-        let newBlock = BlockDesc.init(inVars: block.vars, inOps: ops)
-        newProgramDesc.blocks.append(newBlock)
-        return newProgramDesc
+      }
    }
+    // Collect the remaining nodes' op descriptions; each node is expected to have one (force-unwrapped).
+    var ops: [OpDesc] = []
+    for node in nodes {
+      ops.append(node.opDesc!)
+    }
+    
+    var newProgramDesc = ProgramDesc.init()
+    let newBlock = BlockDesc.init(inVars: block.vars, inOps: ops)
+    newProgramDesc.blocks.append(newBlock)
+    return newProgramDesc
+  }
 }
--- a/metal/paddle-mobile/paddle-mobile/framework/Executor.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Executor.swift
@@ -14,39 +14,50 @@

 import Foundation

-let testTo = 113
+
+let testTo = 81
+
 var isTest = false

-let computePrecision: ComputePrecision = .Float32
+let computePrecision: ComputePrecision = .Float16

-public class ResultHolder {
+public class GPUResultHolder {
  public let dim: [Int]
-  public let resultArr: [Float32]
+  public let capacity: Int  // number of Float32 elements copied into resultPointer
+  public var resultPointer: UnsafeMutablePointer<Float32>?  // stays nil when init receives a nil inPointer
  public var intermediateResults: [String : [Variant]]?
  public let elapsedTime: Double
-  public init(inDim: [Int], inResult: [Float32], inElapsedTime: Double, inIntermediateResults: [String : [Variant]]? = nil) {
+  public init(inDim: [Int], inPointer: UnsafeMutablePointer<Float32>?, inCapacity: Int, inElapsedTime: Double, inIntermediateResults: [String : [Variant]]? = nil) {
    dim = inDim
-    resultArr = inResult
+    capacity = inCapacity
+    // Keep a private copy of the caller's buffer: allocate inCapacity floats and copy them from inPointer.
+    if let inInPointer = inPointer {
+      resultPointer = UnsafeMutablePointer<Float32>.allocate(capacity: inCapacity)
+      resultPointer?.initialize(from: inInPointer, count: inCapacity)
+    }
+    // NOTE(review): nothing visible in this class deallocates that copy — presumably the consumer must free resultPointer; confirm.
    elapsedTime = inElapsedTime
    intermediateResults = inIntermediateResults
  }
+  
 }

-extension ResultHolder: CustomDebugStringConvertible, CustomStringConvertible {
+extension GPUResultHolder: CustomDebugStringConvertible, CustomStringConvertible {
  public var debugDescription: String {
-    var str = ""
-    str += "Dim: \(dim) \n value:[ "
-    if resultArr.count < 20 {
-      for d in resultArr {
-        str += " \(d) "
-      }
-    } else {
-      for d in stride(from: 0, to: resultArr.count, by: resultArr.count/20) {
-        str += " \(resultArr[d]) "
-      }
-    }
-    str += " ]"
-    return str
+    // Summarise the copied result buffer rather than trapping, so that
+    // debug-printing a holder can never bring the process down.
+    var str = ""
+    str += "Dim: \(dim) \n value:[ "
+    if let pointer = resultPointer, capacity > 0 {
+      // Small buffers are listed in full; larger ones are sampled at a stride
+      // of capacity/20, the rule the array-based version being removed used.
+      let step = capacity < 20 ? 1 : capacity / 20
+      for d in stride(from: 0, to: capacity, by: step) {
+        str += " \(pointer[d]) "
+      }
+    }
+    str += " ]"
+    return str
  }
  
  public var description: String {
@@ -67,7 +78,7 @@ public class Executor<P: PrecisionType> {
    queue = inQueue
    for block in inProgram.programDesc.blocks {
      //block.ops.count
-      for i in 0..<testTo {
+      for i in 0..<block.ops.count {
        let op = block.ops[i]
        do {
          let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
@@ -79,7 +90,7 @@ public class Executor<P: PrecisionType> {
    }
  }
  
-  public func predict(input: MTLTexture, dim: [Int], completionHandle: @escaping (ResultHolder) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws {
+  public func predict(input: MTLTexture, dim: [Int], completionHandle: @escaping (GPUResultHolder) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws {
    guard let buffer = queue.makeCommandBuffer() else {
      throw PaddleMobileError.predictError(message: "CommandBuffer is nil")
    }
@@ -101,7 +112,7 @@ public class Executor<P: PrecisionType> {
    let inputTexture = InputTexture.init(inMTLTexture: resInput, inExpectDim: Dim.init(inDim: dim))
    program.scope.setInput(input: inputTexture)
    //(ops.count - except)
-    for i in 0..<testTo {
+    for i in 0..<(ops.count - except) {
      let op = ops[i]
      do {
        try op.run(device: device, buffer: buffer)
@@ -112,18 +123,19 @@ public class Executor<P: PrecisionType> {
    
    var outputTextures: [String : [Variant]]?
    if except > 0 {
-      outputTextures = ops[testTo-1].inputVariant()
+      ops[ops.count - except].computeMiddleResult(device: device, buffer: buffer)
+      outputTextures = ops[ops.count - except].inputVariant()
    }
    
    buffer.addCompletedHandler { [weak self] (commandbuffer) in
-      
-      let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
-      print(inputArr.strideArray())
-
-      print(dim)
-      writeToLibrary(fileName: "test_image_ssd_ar", array: inputArr)
+//      let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
+//      print(inputArr.strideArray())
+//
+////      print(dim)
+//      writeToLibrary(fileName: "test_image_ssd_ar", array: inputArr)
+//      print(" write done ")

-      print("write to library done")
+//      print("write to library done")
 //      return
 //                  print(inputArr)
 //
@@ -131,11 +143,11 @@ public class Executor<P: PrecisionType> {
 //                  print(stridableInput)
 //
 //                  let _: Flo? = input.logDesc(header: "input: ", stridable: true)
-      for i in 0..<testTo {
-        let op = self!.ops[i]
-        print(" 第 \(i) 个 op: ")
-        op.delogOutput()
-      }
+//      for i in 0..<self!.ops.count {
+//        let op = self!.ops[i]
+//        print(" 第 \(i) 个 op: ")
+//        op.delogOutput()
+//      }
      
 //      return;
 //      self!.ops[testTo - 2].delogOutput()
@@ -145,18 +157,23 @@ public class Executor<P: PrecisionType> {
 //      return
      
      guard let SSelf = self else {
+//        return
        fatalError()
      }
      
      let afterDate = Date.init()
-      var resultHolder: ResultHolder
+      var resultHolder: GPUResultHolder
      if except > 0 {
-        resultHolder = ResultHolder.init(inDim: [], inResult: [], inElapsedTime: afterDate.timeIntervalSince(beforeDate), inIntermediateResults: outputTextures)
+        resultHolder = GPUResultHolder.init(inDim: [], inPointer: nil, inCapacity: 0, inElapsedTime: afterDate.timeIntervalSince(beforeDate), inIntermediateResults: outputTextures)
      } else {
        let outputVar: Variant = SSelf.program.scope.output()!
-        let output: Texture<P> = outputVar as! Texture<P>
+        let output: FetchHolder = outputVar as! FetchHolder
+//        let beforeToTensorDate = Date.init()
+
+        resultHolder = GPUResultHolder.init(inDim: output.dim, inPointer: output.result, inCapacity: output.capacity, inElapsedTime: afterDate.timeIntervalSince(beforeDate))
        
-        resultHolder = ResultHolder.init(inDim: output.dim.dims, inResult: output.toTensor(), inElapsedTime: afterDate.timeIntervalSince(beforeDate))
+//        let timeToTensor = Date.init().timeIntervalSince(beforeToTensorDate)
+//        print(timeToTensor)
      }

      completionHandle(resultHolder)

--- a/metal/paddle-mobile/paddle-mobile/framework/Loader.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Loader.swift
@@ -168,7 +168,7 @@ public class Loader<P: PrecisionType> {
            }
          } else {
            if varDesc.name == fetchKey {
-              scope[varDesc.name] = ResultHolder.init(inDim: [], inResult: [], inElapsedTime: 0.0)
+//              scope[varDesc.name] = ResultHolder.init(inDim: [], inResult: [], inCapacity: <#Int#>, inElapsedTime: 0.0)
            } else if varDesc.name == feedKey {
            }
          }