add pribox nms convTranspose prelu op

dcef687f · liuruilong · d4c97eeb · dcef687f · dcef687f · dcef687f
33 changed file
--- a/metal/PreluKernel/PreluKernel.xcodeproj/project.pbxproj
+++ b/metal/PreluKernel/PreluKernel.xcodeproj/project.pbxproj
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 50;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		FCEB6843212F00CC00D2448E /* PreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCEB6842212F00CC00D2448E /* PreluKernel.metal */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		FCEB683F212F00CC00D2448E /* PreluKernel.metallib */ = {isa = PBXFileReference; explicitFileType = "archive.metal-library"; includeInIndex = 0; path = PreluKernel.metallib; sourceTree = BUILT_PRODUCTS_DIR; };
+		FCEB6842212F00CC00D2448E /* PreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PreluKernel.metal; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXGroup section */
+		FCEB6838212F00CC00D2448E = {
+			isa = PBXGroup;
+			children = (
+				FCEB6841212F00CC00D2448E /* PreluKernel */,
+				FCEB6840212F00CC00D2448E /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		FCEB6840212F00CC00D2448E /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				FCEB683F212F00CC00D2448E /* PreluKernel.metallib */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		FCEB6841212F00CC00D2448E /* PreluKernel */ = {
+			isa = PBXGroup;
+			children = (
+				FCEB6842212F00CC00D2448E /* PreluKernel.metal */,
+			);
+			path = PreluKernel;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		FCEB683E212F00CC00D2448E /* PreluKernel */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = FCEB6846212F00CC00D2448E /* Build configuration list for PBXNativeTarget "PreluKernel" */;
+			buildPhases = (
+				FCEB683D212F00CC00D2448E /* Sources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = PreluKernel;
+			productName = PreluKernel;
+			productReference = FCEB683F212F00CC00D2448E /* PreluKernel.metallib */;
+			productType = "com.apple.product-type.metal-library";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		FCEB6839212F00CC00D2448E /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				LastUpgradeCheck = 0940;
+				ORGANIZATIONNAME = orange;
+				TargetAttributes = {
+					FCEB683E212F00CC00D2448E = {
+						CreatedOnToolsVersion = 9.4.1;
+					};
+				};
+			};
+			buildConfigurationList = FCEB683C212F00CC00D2448E /* Build configuration list for PBXProject "PreluKernel" */;
+			compatibilityVersion = "Xcode 9.3";
+			developmentRegion = en;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+			);
+			mainGroup = FCEB6838212F00CC00D2448E;
+			productRefGroup = FCEB6840212F00CC00D2448E /* Products */;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				FCEB683E212F00CC00D2448E /* PreluKernel */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXSourcesBuildPhase section */
+		FCEB683D212F00CC00D2448E /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				FCEB6843212F00CC00D2448E /* PreluKernel.metal in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin XCBuildConfiguration section */
+		FCEB6844212F00CC00D2448E /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				IPHONEOS_DEPLOYMENT_TARGET = 11.4;
+				MTL_ENABLE_DEBUG_INFO = YES;
+				SDKROOT = iphoneos;
+			};
+			name = Debug;
+		};
+		FCEB6845212F00CC00D2448E /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				IPHONEOS_DEPLOYMENT_TARGET = 11.4;
+				MTL_ENABLE_DEBUG_INFO = NO;
+				SDKROOT = iphoneos;
+			};
+			name = Release;
+		};
+		FCEB6847212F00CC00D2448E /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				CODE_SIGN_STYLE = Automatic;
+				DEVELOPMENT_TEAM = Z5M2UUN5YV;
+				PRODUCT_NAME = "$(TARGET_NAME)";
+			};
+			name = Debug;
+		};
+		FCEB6848212F00CC00D2448E /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				CODE_SIGN_STYLE = Automatic;
+				DEVELOPMENT_TEAM = Z5M2UUN5YV;
+				PRODUCT_NAME = "$(TARGET_NAME)";
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		FCEB683C212F00CC00D2448E /* Build configuration list for PBXProject "PreluKernel" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				FCEB6844212F00CC00D2448E /* Debug */,
+				FCEB6845212F00CC00D2448E /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		FCEB6846212F00CC00D2448E /* Build configuration list for PBXNativeTarget "PreluKernel" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				FCEB6847212F00CC00D2448E /* Debug */,
+				FCEB6848212F00CC00D2448E /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = FCEB6839212F00CC00D2448E /* Project object */;
+}
--- a/metal/PreluKernel/PreluKernel.xcodeproj/project.xcworkspace/contents.xcworkspacedata
+++ b/metal/PreluKernel/PreluKernel.xcodeproj/project.xcworkspace/contents.xcworkspacedata
+<?xml version="1.0" encoding="UTF-8"?>
+<Workspace
+   version = "1.0">
+   <FileRef
+      location = "self:PreluKernel.xcodeproj">
+   </FileRef>
+</Workspace>
--- a/metal/PreluKernel/PreluKernel.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
+++ b/metal/PreluKernel/PreluKernel.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>IDEDidComputeMac32BitWarning</key>
+	<true/>
+</dict>
+</plist>
--- a/metal/PreluKernel/PreluKernel.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist
+++ b/metal/PreluKernel/PreluKernel.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>SchemeUserState</key>
+	<dict>
+		<key>PreluKernel.xcscheme</key>
+		<dict>
+			<key>orderHint</key>
+			<integer>0</integer>
+		</dict>
+	</dict>
+</dict>
+</plist>
--- a/metal/PreluKernel/PreluKernel/PreluKernel.metal
+++ b/metal/PreluKernel/PreluKernel/PreluKernel.metal
+//
+//  PreluKernel.metal
+//  PreluKernel
+//
+//  Created by liuRuiLong on 2018/8/23.
+//  Copyright © 2018年 orange. All rights reserved.
+//
+
+#include <metal_stdlib>
+using namespace metal;
+
+
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/ModelHelper.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ModelHelper.swift
@@ -62,10 +62,11 @@ struct MobileNet: Net{
        fatalError("no file call \(fileName)")
      }
    }
-        subscript(index: Int) -> String{
+    subscript(index: Int) -> String {
      return contents[index]
    }
  }
+  
  let labels = PreWords.init(fileName: "synset")
  
  func resultStr(res: [Float]) -> String {
@@ -102,6 +103,166 @@ struct MobileNet_ssd_hand: Net{
    fatalError()
  }
  
+  /// Area of a box whose first two values are treated as the minimum corner and
+  /// whose last two values are treated as the maximum corner.
+  ///
+  /// - Parameters:
+  ///   - box: At least four values; only indices 0...3 are read.
+  ///   - normalized: When `false`, one is added to both the width and the height
+  ///     before multiplying.
+  /// - Returns: `0` when the maximum corner lies before the minimum corner on
+  ///   either axis, otherwise the (optionally +1 adjusted) width times height.
+  func bboxArea(box: [Float32], normalized: Bool) -> Float32 {
+    // Written as a negated "is inverted" test so NaN coordinates fall through
+    // to the multiplication instead of being reported as an empty box.
+    guard !(box[2] < box[0] || box[3] < box[1]) else {
+      return 0.0
+    }
+    let width = box[2] - box[0]
+    let height = box[3] - box[1]
+    return normalized ? width * height : (width + 1) * (height + 1)
+  }
+  
+  
+  /// Jaccard overlap (intersection over union) of two boxes.
+  ///
+  /// Each box is read as `[xmin, ymin, xmax, ymax]`; only indices 0...3 are used.
+  ///
+  /// - Parameters:
+  ///   - box1: First box.
+  ///   - box2: Second box.
+  ///   - normalized: Forwarded to `bboxArea`. When `false`, the intersection is
+  ///     measured with the same `+ 1` extent that `bboxArea` applies, so that a
+  ///     box compared with itself yields exactly `1`.
+  /// - Returns: `0` when the boxes are disjoint, otherwise
+  ///   `intersection / (area1 + area2 - intersection)`. Two overlapping boxes
+  ///   that both have zero area produce `0 / 0`, i.e. NaN.
+  func jaccardOverLap(box1: [Float32], box2: [Float32], normalized: Bool) -> Float32 {
+    if box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
+      box2[3] < box1[1] {
+      return 0.0
+    }
+    // Keep the intersection in the same units as `bboxArea`: un-normalized
+    // boxes count one extra unit on each axis.
+    let norm: Float32 = normalized ? 0.0 : 1.0
+    let interXmin = max(box1[0], box2[0])
+    let interYmin = max(box1[1], box2[1])
+    let interXmax = min(box1[2], box2[2])
+    let interYmax = min(box1[3], box2[3])
+    let interW = interXmax - interXmin + norm
+    let interH = interYmax - interYmin + norm
+    let interArea = interW * interH
+    let bbox1Area = bboxArea(box: box1, normalized: normalized)
+    let bbox2Area = bboxArea(box: box2, normalized: normalized)
+    return interArea / (bbox1Area + bbox2Area - interArea)
+  }
+  
+  /// Post-processes the SSD outputs: per-class non-maximum suppression followed
+  /// by a global top-k cut, flattened into a single array.
+  ///
+  /// Boxes are read from `"box_coder_0.tmp_0"` and class scores from
+  /// `"transpose_12.tmp_0"`; both lookups go through the project's `?!` operator.
+  ///
+  /// - Parameter paddleMobileRes: Result textures keyed by variable name.
+  /// - Returns: Six values per kept detection (label, score, then the four box
+  ///   values), grouped by ascending label and ordered by descending score
+  ///   within each label.
+  func fetchResult(paddleMobileRes: [String : Texture<Float32>]) -> [Float32]{
+    let bbox = paddleMobileRes["box_coder_0.tmp_0"] ?! " no bbox "
+    let scores = paddleMobileRes["transpose_12.tmp_0"] ?! " no scores "
+    let scoreThreshold: Float32 = 0.01
+    let nmsTopK = 400
+    let keepTopK = 200
+    let nmsEta: Float32 = 1.0
+    let nmsThreshold: Float32 = 0.45
+
+    let bboxArr = bbox.metalTexture.floatArray { (f) -> Float32 in
+      return f
+    }
+
+    let scoresArr = scores.metalTexture.floatArray { (f) -> Float32 in
+      return f
+    }
+
+    let numOfOneC = (scores.originDim[2] + 3) / 4   // e.g. 480
+    let cNumOfOneClass = numOfOneC * 4              // e.g. 1920
+
+    let boxSize = bbox.originDim[2]                 // e.g. 4
+    let classNum = scores.originDim[1]              // e.g. 7
+    let classNumOneTexture = classNum * 4           // e.g. 28
+
+    // Regroup the scores so that every class owns one contiguous run of
+    // `cNumOfOneClass` values: the source stores, for each group `n`, four
+    // consecutive values per class.
+    var scoreFormatArr: [Float32] = []
+    scoreFormatArr.reserveCapacity(classNum * cNumOfOneClass)
+    for c in 0..<classNum {
+      for n in 0..<numOfOneC {
+        let to = n * classNumOneTexture + c * 4
+        scoreFormatArr.append(contentsOf: scoresArr[to..<(to + 4)])
+      }
+    }
+
+    // Per-class NMS. `selectedPerClass[label]` holds (box index, score) pairs.
+    var selectedPerClass = [[(Int, Float32)]](repeating: [], count: classNum)
+
+    for classIndex in 0..<classNum {
+      // Index the regrouped array through an explicit offset. A slice of it
+      // would keep the parent's indices, so subscripting that slice from 0
+      // traps for every class after the first.
+      let classOffset = classIndex * cNumOfOneClass
+
+      var scoreThresholdArr: [(Float32, Int)] = []
+      for i in 0..<cNumOfOneClass {
+        let score = scoreFormatArr[classOffset + i]
+        if score > scoreThreshold {
+          scoreThresholdArr.append((score, i))
+        }
+      }
+
+      scoreThresholdArr.sort { $0 > $1 }
+
+      if scoreThresholdArr.count > nmsTopK {
+        scoreThresholdArr.removeLast(scoreThresholdArr.count - nmsTopK)
+      }
+
+      // The adaptive threshold restarts for every class so that one class
+      // cannot tighten the threshold used by the next.
+      var adaptiveThreshold = nmsThreshold
+      var selectedIndex: [(Int, Float32)] = []
+
+      for (score, idx) in scoreThresholdArr {
+        let box1 = Array<Float32>(bboxArr[(idx * boxSize)..<(idx * boxSize + 4)])
+        var keep = true
+        for kept in selectedIndex {
+          let keptIdx = kept.0
+          // Compare against the already-kept box, not against the candidate.
+          let box2 = Array<Float32>(bboxArr[(keptIdx * boxSize)..<(keptIdx * boxSize + 4)])
+          let overlap = jaccardOverLap(box1: box1, box2: box2, normalized: true)
+          keep = (overlap <= adaptiveThreshold)
+          if !keep {
+            break
+          }
+        }
+
+        if keep {
+          selectedIndex.append((idx, score))
+          if nmsEta < 1.0 && adaptiveThreshold > 0.5 {
+            adaptiveThreshold *= nmsEta
+          }
+        }
+      }
+      selectedPerClass[classIndex] = selectedIndex
+    }
+
+    // Global cut: keep the `keepTopK` best detections across all classes.
+    var scoreIndexPairs: [(Float32, (Int, Int))] = []
+    for (label, selected) in selectedPerClass.enumerated() {
+      for (boxIndex, score) in selected {
+        scoreIndexPairs.append((score, (label, boxIndex)))
+      }
+    }
+
+    scoreIndexPairs.sort { $0.0 > $1.0 }
+
+    if scoreIndexPairs.count > keepTopK {
+      scoreIndexPairs.removeLast(scoreIndexPairs.count - keepTopK)
+    }
+
+    // Regroup the survivors by label, remembering the box index (not the
+    // label) so the coordinates can be looked up below.
+    var keptPerClass = [[(Int, Float32)]](repeating: [], count: classNum)
+    for scoreIndexPair in scoreIndexPairs {
+      let label = scoreIndexPair.1.0
+      let boxIndex = scoreIndexPair.1.1
+      keptPerClass[label].append((boxIndex, scoreIndexPair.0))
+    }
+
+    var outputArr: [Float32] = []
+    for (label, detections) in keptPerClass.enumerated() {
+      for (boxIndex, score) in detections {
+        outputArr.append(Float32(label))   // label
+        outputArr.append(score)            // score
+        outputArr.append(contentsOf: bboxArr[(boxIndex * boxSize)..<(boxIndex * boxSize + 4)])
+      }
+    }
+
+    return outputArr
+  }
+  
  var preprocessKernel: CusomKernel
  let dim = [1, 300, 300, 3]
  let modelPath: String

--- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
@@ -61,7 +61,6 @@ class ViewController: UIViewController {
    executor?.clear()
    program = nil
    executor = nil
-        
  }
  
  @IBAction func predictAct(_ sender: Any) {

--- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
+++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
-//
-//  AppDelegate.swift
-//  paddle-mobile-unit-test
-//
-//  Created by liuRuiLong on 2018/8/10.
-//  Copyright © 2018年 orange. All rights reserved.
-//
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */

 import UIKit


--- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
+++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
-//
-//  ViewController.swift
-//  paddle-mobile-unit-test
-//
-//  Created by liuRuiLong on 2018/8/10.
-//  Copyright © 2018年 orange. All rights reserved.
-//
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */

 import UIKit
 import Metal

--- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
@@ -69,6 +69,14 @@
 		FCD04E7220F343420007374F /* ConvAddOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E7120F343420007374F /* ConvAddOp.swift */; };
 		FCD04E7420F3437E0007374F /* ConvAddKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E7320F3437E0007374F /* ConvAddKernel.swift */; };
 		FCDC0FEB21099A1D00DC9EFB /* Tools.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDC0FEA21099A1D00DC9EFB /* Tools.swift */; };
+		FCDDC6C6212F9FB800E5EF74 /* PreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6C5212F9FB800E5EF74 /* PreluKernel.swift */; };
+		FCDDC6C8212FA3CA00E5EF74 /* ConvTransposeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */; };
+		FCDDC6CA212FDF6800E5EF74 /* BatchNormKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6C9212FDF6800E5EF74 /* BatchNormKernel.metal */; };
+		FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */; };
+		FCDDC6CF212FE14700E5EF74 /* PriorBoxKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */; };
+		FCDE8A33212A917900F4A8F6 /* ConvTransposeOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */; };
+		FCEB684A212F00DB00D2448E /* PreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCEB6849212F00DB00D2448E /* PreluKernel.metal */; };
+		FCEB684C212F093800D2448E /* PreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEB684B212F093800D2448E /* PreluOp.swift */; };
 		FCEBC0F420F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEBC0F320F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift */; };
 		FCEBC0F620F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEBC0F520F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift */; };
 		FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCF2D73720E64E70007AC5F5 /* Kernel.swift */; };
@@ -141,9 +149,17 @@
 		FCD04E7120F343420007374F /* ConvAddOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddOp.swift; sourceTree = "<group>"; };
 		FCD04E7320F3437E0007374F /* ConvAddKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddKernel.swift; sourceTree = "<group>"; };
 		FCDC0FEA21099A1D00DC9EFB /* Tools.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Tools.swift; sourceTree = "<group>"; };
+		FCDDC6C5212F9FB800E5EF74 /* PreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PreluKernel.swift; sourceTree = "<group>"; };
+		FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvTransposeKernel.swift; sourceTree = "<group>"; };
+		FCDDC6C9212FDF6800E5EF74 /* BatchNormKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = BatchNormKernel.metal; sourceTree = "<group>"; };
+		FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ReluKernel.metal; sourceTree = "<group>"; };
+		FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PriorBoxKernel.metal; sourceTree = "<group>"; };
+		FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvTransposeOp.swift; sourceTree = "<group>"; };
+		FCEB6849212F00DB00D2448E /* PreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PreluKernel.metal; sourceTree = "<group>"; };
+		FCEB684B212F093800D2448E /* PreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PreluOp.swift; sourceTree = "<group>"; };
 		FCEBC0F320F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; name = ConvAddBatchNormReluOp.swift; path = "paddle-mobile/Operators/ConvAddBatchNormReluOp.swift"; sourceTree = SOURCE_ROOT; };
 		FCEBC0F520F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddBatchNormReluKernel.swift; sourceTree = "<group>"; };
-		FCF2D73720E64E70007AC5F5 /* Kernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; name = Kernel.swift; path = "paddle-mobile/Operators/Kernels/Kernel.swift"; sourceTree = SOURCE_ROOT; };
+		FCF2D73720E64E70007AC5F5 /* Kernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; name = Kernel.swift; path = "paddle-mobile/Operators/Kernels/Base/Kernel.swift"; sourceTree = SOURCE_ROOT; };
 /* End PBXFileReference section */

 /* Begin PBXFrameworksBuildPhase section */
@@ -255,6 +271,8 @@
 				FCBCCC66212306B000D94F7E /* ConcatOp.swift */,
 				FCBCCC6A2123071700D94F7E /* BoxcoderOp.swift */,
 				FCBCCC6E2123097100D94F7E /* MulticlassNMSOp.swift */,
+				FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */,
+				FCEB684B212F093800D2448E /* PreluOp.swift */,
 			);
 			path = Operators;
 			sourceTree = "<group>";
@@ -279,15 +297,15 @@
 		FC086BA520E67E8500D85EF7 /* Kernels */ = {
 			isa = PBXGroup;
 			children = (
+				FCDDC6CD212FE02100E5EF74 /* Base */,
+				FCEB6837212F00B100D2448E /* metal */,
+				FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */,
 				FC0E2DBB20EE45FE009C1FAC /* ConvKernel.swift */,
-				FCF2D73720E64E70007AC5F5 /* Kernel.swift */,
-				FC1B16B220EC9A4F00678B91 /* Kernels.metal */,
 				FC1B186520ECF1C600678B91 /* ResizeKernel.swift */,
 				FC0E2DB920EE3B8D009C1FAC /* ReluKernel.swift */,
 				FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */,
 				FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */,
 				FC5163F520EF556E00636C28 /* Texture2DTo2DArrayKernel.swift */,
-				FC4CB74820F0B954007C0C6D /* ConvKernel.metal */,
 				FCEBC0F520F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift */,
 				FCD04E6720F315020007374F /* PoolKernel.swift */,
 				FCD04E6B20F31A280007374F /* SoftmaxKernel.swift */,
@@ -299,6 +317,7 @@
 				FCBCCC68212306D300D94F7E /* ConcatKernel.swift */,
 				FCBCCC6C2123073A00D94F7E /* BoxcoderKernel.swift */,
 				FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */,
+				FCDDC6C5212F9FB800E5EF74 /* PreluKernel.swift */,
 			);
 			path = Kernels;
 			sourceTree = "<group>";
@@ -313,6 +332,27 @@
 			path = Base;
 			sourceTree = "<group>";
 		};
+		FCDDC6CD212FE02100E5EF74 /* Base */ = {
+			isa = PBXGroup;
+			children = (
+				FCF2D73720E64E70007AC5F5 /* Kernel.swift */,
+			);
+			path = Base;
+			sourceTree = "<group>";
+		};
+		FCEB6837212F00B100D2448E /* metal */ = {
+			isa = PBXGroup;
+			children = (
+				FC1B16B220EC9A4F00678B91 /* Kernels.metal */,
+				FC4CB74820F0B954007C0C6D /* ConvKernel.metal */,
+				FCEB6849212F00DB00D2448E /* PreluKernel.metal */,
+				FCDDC6C9212FDF6800E5EF74 /* BatchNormKernel.metal */,
+				FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */,
+				FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */,
+			);
+			path = metal;
+			sourceTree = "<group>";
+		};
 /* End PBXGroup section */

 /* Begin PBXHeadersBuildPhase section */
@@ -417,6 +457,7 @@
 				FC039B9F20E11CB20081E9F8 /* Tensor.swift in Sources */,
 				FC0E2DBC20EE45FE009C1FAC /* ConvKernel.swift in Sources */,
 				FC039BAA20E11CBC0081E9F8 /* ElementwiseAddOp.swift in Sources */,
+				FCDE8A33212A917900F4A8F6 /* ConvTransposeOp.swift in Sources */,
 				FCBCCC6B2123071700D94F7E /* BoxcoderOp.swift in Sources */,
 				FC039B9B20E11CA00081E9F8 /* Executor.swift in Sources */,
 				FCD04E7020F31B720007374F /* ReshapeKernel.swift in Sources */,
@@ -426,11 +467,15 @@
 				FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */,
 				FC1B186620ECF1C600678B91 /* ResizeKernel.swift in Sources */,
 				FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */,
+				FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */,
+				FCDDC6C6212F9FB800E5EF74 /* PreluKernel.swift in Sources */,
 				FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */,
 				FCEBC0F420F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift in Sources */,
 				FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */,
+				FCEB684C212F093800D2448E /* PreluOp.swift in Sources */,
 				FC60DB8920E9AAA500FF203F /* MetalExtension.swift in Sources */,
 				FCEBC0F620F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift in Sources */,
+				FCDDC6CA212FDF6800E5EF74 /* BatchNormKernel.metal in Sources */,
 				FC1B16B320EC9A4F00678B91 /* Kernels.metal in Sources */,
 				FC039BBA20E11CC20081E9F8 /* TensorDesc.swift in Sources */,
 				FC039BA020E11CB20081E9F8 /* Dim.swift in Sources */,
@@ -456,6 +501,7 @@
 				FC0E2DBA20EE3B8D009C1FAC /* ReluKernel.swift in Sources */,
 				FCBCCC6D2123073A00D94F7E /* BoxcoderKernel.swift in Sources */,
 				FCBCCC69212306D300D94F7E /* ConcatKernel.swift in Sources */,
+				FCDDC6C8212FA3CA00E5EF74 /* ConvTransposeKernel.swift in Sources */,
 				FC82735920E3C04200BE430A /* OpCreator.swift in Sources */,
 				FCBCCC5D2122F8A100D94F7E /* DepthwiseConvOp.swift in Sources */,
 				FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */,
@@ -468,6 +514,8 @@
 				FC039BA220E11CB70081E9F8 /* Loader.swift in Sources */,
 				FCBCCC67212306B000D94F7E /* ConcatOp.swift in Sources */,
 				FCD04E6C20F31A280007374F /* SoftmaxKernel.swift in Sources */,
+				FCEB684A212F00DB00D2448E /* PreluKernel.metal in Sources */,
+				FCDDC6CF212FE14700E5EF74 /* PriorBoxKernel.metal in Sources */,
 				FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */,
 				FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */,
 				FC039BC020E11CC20081E9F8 /* BlockDesc.swift in Sources */,

--- a/metal/paddle-mobile/paddle-mobile/Executor.swift
+++ b/metal/paddle-mobile/paddle-mobile/Executor.swift
@@ -17,11 +17,13 @@ import Foundation
 public class ResultHolder<P: PrecisionType> {
    public let dim: [Int]
    public let resultArr: [P]
+    public var intermediateResults: [Texture<P>]?
    public let elapsedTime: Double
-    public init(inDim: [Int], inResult: [P], inElapsedTime: Double) {
+    public init(inDim: [Int], inResult: [P], inElapsedTime: Double, inIntermediateResults: [Texture<P>]? = nil) {
        dim = inDim
        resultArr = inResult
        elapsedTime = inElapsedTime
+        intermediateResults = inIntermediateResults
    }
 }

@@ -116,7 +118,6 @@ public class Executor<P: PrecisionType> {
            
 //            self.ops[2].delogOutput()
            
-            
            let afterDate = Date.init()
            
            guard let outputVar = self.program.scope.output() else {

--- a/metal/paddle-mobile/paddle-mobile/Operators/Base/OpParam.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/OpParam.swift
@@ -39,8 +39,19 @@ protocol OpParam {
  static func output<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
  static func outputY<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
  static func inputY<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+  
+  static func inputImage<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+  
+  static func outputBoxes<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
+  
  static func outputOut<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
+  
+  static func outputVariances<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
+  
  static func getAttr<T>(key: String, attrs: [String : Attr]) throws -> T
+  
+  static func inputAlpha<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
+  
 }

 extension OpParam {
@@ -58,16 +69,52 @@ extension OpParam {
    return v
  }
  
+  /// Looks up the variable registered under the `"Variances"` key.
+  ///
+  /// - Parameters:
+  ///   - outputs: Output map of the op, handed to `getFirstTensor` as `map`.
+  ///   - from: Scope handed to `getFirstTensor`.
+  /// - Throws: Any error raised by `getFirstTensor`, unchanged.
+  static func outputVariances<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
+    let variances: VarType = try getFirstTensor(key: "Variances", map: outputs, from: from)
+    return variances
+  }
+  
+  /// Looks up the variable registered under the `"Alpha"` key.
+  ///
+  /// - Parameters:
+  ///   - inputs: Input map of the op, handed to `getFirstTensor` as `map`.
+  ///   - from: Scope handed to `getFirstTensor`.
+  /// - Throws: Any error raised by `getFirstTensor`, unchanged.
+  static func inputAlpha<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+    let alpha: VarType = try getFirstTensor(key: "Alpha", map: inputs, from: from)
+    return alpha
+  }
+  
+  
+  /// Looks up the variable registered under the `"Image"` key.
+  ///
+  /// - Parameters:
+  ///   - inputs: Input map of the op, handed to `getFirstTensor` as `map`.
+  ///   - from: Scope handed to `getFirstTensor`.
+  /// - Throws: Any error raised by `getFirstTensor`, unchanged.
+  static func inputImage<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
+    let image: VarType = try getFirstTensor(key: "Image", map: inputs, from: from)
+    return image
+  }
+  
  static func inputX<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
    do {
      let tensorX: VarType = try getFirstTensor(key: "X", map: inputs, from: from)
-            
      return tensorX
    } catch let error {
      throw error
    }
  }
  
+  /// Looks up the variable registered under the `"Boxes"` key.
+  ///
+  /// - Parameters:
+  ///   - outputs: Output map of the op, handed to `getFirstTensor` as `map`.
+  ///   - from: Scope handed to `getFirstTensor`.
+  /// - Throws: Any error raised by `getFirstTensor`, unchanged.
+  static func outputBoxes<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
+    let boxes: VarType = try getFirstTensor(key: "Boxes", map: outputs, from: from)
+    return boxes
+  }
+  
  static func input<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
    do {
      let tensorInput: VarType = try getFirstTensor(key: "Input", map: inputs, from: from)

--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
@@ -14,10 +14,12 @@

 import Foundation

+
 class ConvAddBatchNormReluParam<P: PrecisionType>: OpParam {
    typealias ParamPrecisionType = P
    required init(opDesc: OpDesc, inScope: Scope) throws {
        do {
+          
            filter = try ConvAddBatchNormReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
            input = try ConvAddBatchNormReluParam.input(inputs: opDesc.inputs, from: inScope)
            output = try ConvAddBatchNormReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
@@ -29,6 +31,7 @@ class ConvAddBatchNormReluParam<P: PrecisionType>: OpParam {
            groups = try ConvAddBatchNormReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
            variance = try ConvAddBatchNormReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope)
            bias = try ConvAddBatchNormReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope)
+          
            scale = try ConvAddBatchNormReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
            mean = try ConvAddBatchNormReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
            y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.paraInputs, from: inScope)

--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
@@ -61,6 +61,7 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>,
    typealias OpType = ConvAddOp<P>
    
    func inferShape() {
+        
        let inDims = para.input.dim
        let filterDim = para.filter.dim
        let strides = para.stride

--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+
+import Foundation
+
+/// Parameter type used by `ConvTransposeOp`; adds nothing on top of `ConvParam`.
+class ConvTransposeParam<P: PrecisionType>: ConvParam<P> {
+  typealias ParamPrecisionType = P
+
+  /// Forwards to `ConvParam`'s initializer; any error it throws reaches the
+  /// caller unchanged.
+  required init(opDesc: OpDesc, inScope: Scope) throws {
+    try super.init(opDesc: opDesc, inScope: inScope)
+  }
+}
+
+/// Operator that pairs `ConvTransposeKernel` with `ConvTransposeParam`.
+class ConvTransposeOp<P: PrecisionType>: Operator<ConvTransposeKernel<P>, ConvTransposeParam<P>>, Runable, Creator, InferShaperable{
+  typealias OpType = ConvTransposeOp<P>
+
+  /// Currently does nothing: the only statement in the body is commented out.
+  func inferShape() {
+    // para.output.dim = para.input.dim
+  }
+
+  /// Calls the kernel's `compute` with `buffer` and this op's parameters.
+  ///
+  /// - Throws: Any error raised by `kernel.compute`, unchanged.
+  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+    try kernel.compute(commandBuffer: buffer, param: para)
+  }
+
+  /// Prints a marker line, then calls `logDesc` on the input and output textures.
+  func delogOutput() {
+    print("conv transpose delog")
+    let _: P? = para.input.metalTexture.logDesc(header: "conv transpose input: ", stridable: true)
+    let _: P? = para.output.metalTexture.logDesc(header: "conv transpose output: ", stridable: true)
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernel.swift
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
@@ -110,6 +110,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
            throw PaddleMobileError.predictError(message: " encode is nil")
        }
      
+
        encoder.setTexture(param.input.metalTexture, index: 0)
        encoder.setTexture(param.output.metalTexture, index: 1)
        encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
@@ -13,12 +13,20 @@
 limitations under the License. */

 import Foundation
-import MetalPerformanceShaders

 class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
  var metalParam: MetalConvParam!
  required init(device: MTLDevice, param: ConvAddParam<P>) {
+    if param.filter.width == 1 && param.filter.height == 1 {
      super.init(device: device, inFunctionName: "conv_add_1x1")
+    } else if param.filter.channel == 1 {
+      super.init(device: device, inFunctionName: "depthwise_conv_add_3x3")
+    } else {
+      super.init(device: device, inFunctionName: "conv_add_3x3")
+    }
+    
+    param.output.initTexture(device: device, transpose: [0, 3, 1, 2])
+    
    let offsetX = param.filter.width/2 - Int(param.paddings[0])
    let offsetY = param.filter.height/2 - Int(param.paddings[1])
    
@@ -33,8 +41,6 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
  }
  
  func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam<P>) throws {
-        
-        
    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
      throw PaddleMobileError.predictError(message: " encode is nil")
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
@@ -13,6 +13,7 @@
 limitations under the License. */

 import Foundation
+import MetalPerformanceShaders

 struct ConvBNReluTestParam: TestParam {
    let inputTexture: MTLTexture
@@ -24,6 +25,7 @@ struct ConvBNReluTestParam: TestParam {
    let newBiaseBuffer: MTLBuffer
    let filterSize: (width: Int, height: Int, channel: Int)
    init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) {
+        
        inputTexture = inInputTexture
        outputTexture = inOutputTexture
        metalParam = inMetalParam

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
@@ -14,7 +14,6 @@

 import Foundation

-
 public struct MetalConvParam {
  let offsetX: Int16
  let offsetY: Int16
@@ -27,7 +26,14 @@ public struct MetalConvParam {
 class ConvKernel<P: PrecisionType>: Kernel, Computable {
  var metalParam: MetalConvParam!
  required init(device: MTLDevice, param: ConvParam<P>) {
-        super.init(device: device, inFunctionName: "conv_add_1x1")
+    if param.filter.width == 1 && param.filter.height == 1 {
+      super.init(device: device, inFunctionName: "conv_1x1")
+    } else if param.filter.channel == 1 {
+      super.init(device: device, inFunctionName: "depthwise_conv_3x3")
+    } else {
+      super.init(device: device, inFunctionName: "conv_3x3")
+    }
+    
    let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0])
    let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1])
    let offsetZ = 0.0
@@ -49,3 +55,5 @@ class ConvKernel<P: PrecisionType>: Kernel, Computable {
    encoder.endEncoding()
  }
 }
+
+
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+/// Host-side parameter block that `ConvTransposeKernel` binds at buffer index 0 of the
+/// `conv_transpose` compute function. Field order and widths are part of that contract.
+struct MetalConvTransposeParam {
+  /// Filter width and height.
+  let kernelW: UInt16
+  let kernelH: UInt16
+
+  /// Stride along x and y.
+  let strideX: UInt16
+  let strideY: UInt16
+
+  /// Padding along x and y.
+  let paddingX: UInt16
+  let paddingY: UInt16
+
+  /// Dilation along x and y.
+  let dilationX: UInt16
+  let dilationY: UInt16
+}
+
+/// Host-side wrapper around the `conv_transpose` Metal compute function.
+class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable {
+  /// Geometry handed to the shader at buffer index 0; assigned in `init`.
+  var metalParam: MetalConvTransposeParam!
+
+  /// Looks up the `conv_transpose` function and packs filter size, stride, padding and
+  /// dilation from `param` into `metalParam`. Every value is converted with `UInt16(_:)`.
+  required init(device: MTLDevice, param: ConvTransposeParam<P>) {
+    super.init(device: device, inFunctionName: "conv_transpose")
+    metalParam = MetalConvTransposeParam(
+      kernelW: UInt16(param.filter.width),
+      kernelH: UInt16(param.filter.height),
+      strideX: UInt16(param.stride[0]),
+      strideY: UInt16(param.stride[1]),
+      paddingX: UInt16(param.paddings[0]),
+      paddingY: UInt16(param.paddings[1]),
+      dilationX: UInt16(param.dilations[0]),
+      dilationY: UInt16(param.dilations[1])
+    )
+  }
+
+  /// Encodes one dispatch of the kernel into `commandBuffer`.
+  ///
+  /// Bindings: texture 0 = input, texture 1 = output, buffer 0 = `metalParam`,
+  /// buffer 1 = filter weights. The grid is derived from the output texture.
+  /// - Throws: `PaddleMobileError.predictError` when no compute encoder can be created.
+  func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam<P>) throws {
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+      throw PaddleMobileError.predictError(message: " encoder is nil")
+    }
+
+    // Textures first, then buffers.
+    encoder.setTexture(param.input.metalTexture, index: 0)
+    encoder.setTexture(param.output.metalTexture, index: 1)
+
+    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvTransposeParam>.size, index: 0)
+    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+
+    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+    encoder.endEncoding()
+  }
+}
+
+
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift
+//
+//  PreluKernel.swift
+//  paddle-mobile
+//
+//  Created by liuRuiLong on 2018/8/24.
+//  Copyright © 2018年 orange. All rights reserved.
+//
+
+import Foundation
+
+/// Host-side wrapper around the PReLU Metal compute functions.
+class PreluKernel<P: PrecisionType>: Kernel, Computable {
+  /// Selects the Metal function from `param.mode`: "channel" uses `prelu_channel`,
+  /// "element" uses `prelu_element`, and any other value uses `prelu_other`.
+  required init(device: MTLDevice, param: PreluParam<P>) {
+    let functionName: String
+    if param.mode == "channel" {
+      functionName = "prelu_channel"
+    } else if param.mode == "element" {
+      functionName = "prelu_element"
+    } else {
+      functionName = "prelu_other"
+    }
+    super.init(device: device, inFunctionName: functionName)
+  }
+
+  /// Encodes one dispatch of the selected function into `commandBuffer`.
+  ///
+  /// Bindings: texture 0 = input, texture 1 = output, buffer 0 = alpha.
+  /// - Throws: `PaddleMobileError.predictError` when no compute encoder can be created.
+  func compute(commandBuffer: MTLCommandBuffer, param: PreluParam<P>) throws {
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+      throw PaddleMobileError.predictError(message: " encoder is nil")
+    }
+
+    encoder.setTexture(param.input.metalTexture, index: 0)
+    encoder.setTexture(param.output.metalTexture, index: 1)
+
+    encoder.setBuffer(param.alpha.buffer, offset: 0, index: 0)
+
+    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+    encoder.endEncoding()
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
@@ -14,18 +14,89 @@

 import Foundation

+struct PriorBoxMetalParam {  // Parameter block PriorBoxKernel binds at buffer index 0 of the "priorbox" function.
+    let offset: Float32  // Copied from param.offset.
+    let stepWidth: Float32  // param.stepW after the kernel init has filled in a zero step.
+    let stepHeight: Float32  // param.stepH after the kernel init has filled in a zero step.
+    let minSize: Float32  // Last entry of param.minSizes, or 0 when that list is empty.
+    let maxSize: Float32  // Last entry of param.maxSizes, or 0 when that list is empty.
+    let imageWidth: Float32  // param.inputImage.originDim[3] as a float.
+    let imageHeight: Float32  // param.inputImage.originDim[2] as a float.
+    let clip: Bool  // Copied from param.clip. NOTE(review): layout must match the shader-side struct, confirm.
+    let numPriors: uint  // aspecRatiosSize * minSizeSize + maxSizeSize.
+    let aspecRatiosSize: uint  // Number of entries in the expanded aspect-ratio list.
+    let minSizeSize: uint  // param.minSizes.count.
+    let maxSizeSize: uint  // param.maxSizes.count.
+}
+
 class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{
+    var metalParam: PriorBoxMetalParam!
+    
+    required init(device: MTLDevice, param: PriorBoxParam<P>) {
+        super.init(device: device, inFunctionName: "priorbox")
+        param.output.initTexture(device: device, transpose: [2, 0, 1, 3])
+        param.outputVariances.initTexture(device: device, transpose: [2, 0, 1, 3])
+        
+        let imageWidth = Float32(param.inputImage.originDim[3])
+        let imageHeight = Float32(param.inputImage.originDim[2])
+        
+        let featureWidth = param.inputImage.originDim[3]
+        let featureHeight = param.inputImage.originDim[2]
+       
+        if param.stepW == 0 || param.stepH == 0 {
+            param.stepW = Float32(imageWidth) / Float32(featureWidth)
+            param.stepH = Float32(imageHeight) / Float32(featureHeight)
+        }
+      
+        var outputAspectRatior: [Float32] = []
+        outputAspectRatior.append(1.0)
+        
+        let epsilon = 1e-6
+        for ar in param.aspectRatios {
+            var alreadyExist = false
+            for outputAr in outputAspectRatior {
+                if fabs(Double(ar) - Double(outputAr)) < Double(epsilon) {
+                    alreadyExist = true
+                    break
+                }
+            }
+            
+            if !alreadyExist {
+                outputAspectRatior.append(ar)
+            }
+            if param.flip {
+                outputAspectRatior.append(1.0 / ar)
+            }
+        }
+        
+        param.newAspectRatios = outputAspectRatior
+        let aspectRatiosSize = uint(outputAspectRatior.count)
+                
+        let maxSizeSize: uint = uint(param.maxSizes.count)
+        let minSizeSize: uint = uint(param.minSizes.count)
+        
+        let numPriors = aspectRatiosSize * minSizeSize + maxSizeSize
+        
+        let minSize = param.minSizes.last ?? 0.0
+        let maxSize = param.maxSizes.last ?? 0.0
+        
+        metalParam = PriorBoxMetalParam.init(offset: param.offset, stepWidth: param.stepW, stepHeight: param.stepH, minSize: minSize, maxSize: maxSize, imageWidth: imageWidth, imageHeight: imageHeight, clip: param.clip, numPriors: numPriors, aspecRatiosSize: aspectRatiosSize, minSizeSize: minSizeSize, maxSizeSize: maxSizeSize)
+        
+    }
+    
    func compute(commandBuffer: MTLCommandBuffer, param: PriorBoxParam<P>) throws {
        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
            throw PaddleMobileError.predictError(message: " encode is nil")
        }
        encoder.setTexture(param.input.metalTexture, index: 0)
        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setTexture(param.outputVariances.metalTexture, index: 2)
+        encoder.setBytes(&metalParam, length: MemoryLayout<PriorBoxMetalParam>.size, index: 0)
+        encoder.setBytes(param.aspectRatios, length: MemoryLayout<Float32>.size * param.aspectRatios.count, index: 1)
+        encoder.setBytes(param.variances, length: MemoryLayout<Float32>.size * param.variances.count, index: 2)
        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
        encoder.endEncoding()
    }
    
-    required init(device: MTLDevice, param: PriorBoxParam<P>) {
-        super.init(device: device, inFunctionName: "priorbox")
-    }
+   
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+// Half-precision batch norm: out = in * newScale[slice] + newBias[slice], one texel per
+// thread. Threads outside the output texture's width/height/array size do nothing.
+kernel void batchnorm_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                           texture2d_array<half, access::write> outTexture [[texture(1)]],
+                           const device half4 * newScale [[buffer(0)]],
+                           const device half4 * newBias [[buffer(1)]],
+                           uint3 gid [[thread_position_in_grid]]) {
+  const bool outOfRange = gid.x >= outTexture.get_width() ||
+                          gid.y >= outTexture.get_height() ||
+                          gid.z >= outTexture.get_array_size();
+  if (outOfRange) {
+    return;
+  }
+  const uint slice = gid.z;
+  const half4 texel = inTexture.read(gid.xy, slice);
+  const half4 scale = newScale[slice];
+  const half4 shift = newBias[slice];
+  outTexture.write(texel * scale + shift, gid.xy, slice);
+}
+
+// Single-precision batch norm: out = in * newScale[slice] + newBias[slice], one texel per
+// thread. Threads outside the output texture's width/height/array size do nothing.
+kernel void batchnorm(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                      texture2d_array<float, access::write> outTexture [[texture(1)]],
+                      const device float4 * newScale [[buffer(0)]],
+                      const device float4 * newBias [[buffer(1)]],
+                      uint3 gid [[thread_position_in_grid]]) {
+  const bool outOfRange = gid.x >= outTexture.get_width() ||
+                          gid.y >= outTexture.get_height() ||
+                          gid.z >= outTexture.get_array_size();
+  if (outOfRange) {
+    return;
+  }
+  const uint slice = gid.z;
+  const float4 texel = inTexture.read(gid.xy, slice);
+  const float4 scale = newScale[slice];
+  const float4 shift = newBias[slice];
+  outTexture.write(texel * scale + shift, gid.xy, slice);
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal
@@ -314,6 +314,252 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample>
    outTexture.write(output, gid.xy, gid.z);
 }

+// Depthwise 3x3 convolution followed by bias add, scale/shift and ReLU:
+//   out = max((conv + biase[slice]) * new_scale[slice] + new_biase[slice], 0)
+// Each output slice reads only the input slice with the same index. `weights` holds four
+// consecutive 9-tap filters per slice, one for each of the x/y/z/w components.
+kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                         constant MetalConvParam &param [[buffer(0)]],
+                                         const device float *weights [[buffer(1)]],
+                                         const device float4 *biase [[buffer(2)]],
+                                         const device float4 *new_scale [[buffer(3)]],
+                                         const device float4 *new_biase [[buffer(4)]],
+                                         uint3 gid [[thread_position_in_grid]]) {
+    const bool outOfRange = gid.x >= outTexture.get_width() ||
+                            gid.y >= outTexture.get_height() ||
+                            gid.z >= outTexture.get_array_size();
+    if (outOfRange) {
+        return;
+    }
+
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    const uint slice = gid.z;
+    const uint weithTo = slice * kernelHXW * 4;
+
+    const ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+    float4 acc = float4(0.0);
+    // Walk the 3x3 window row by row; `tap` runs 0...8 in the same order as the filter.
+    uint tap = 0;
+    for (int dy = -1; dy <= 1; ++dy) {
+        for (int dx = -1; dx <= 1; ++dx, ++tap) {
+            const float4 texel = inTexture.sample(sample, float2(int(posInInput.x) + dx, int(posInInput.y) + dy), slice);
+            acc.x += texel.x * weights[weithTo + 0 * kernelHXW + tap];
+            acc.y += texel.y * weights[weithTo + 1 * kernelHXW + tap];
+            acc.z += texel.z * weights[weithTo + 2 * kernelHXW + tap];
+            acc.w += texel.w * weights[weithTo + 3 * kernelHXW + tap];
+        }
+    }
+    acc = fmax((acc + biase[slice]) * new_scale[slice] + new_biase[slice], 0.0);
+    outTexture.write(acc, gid.xy, slice);
+}
+
+struct MetalConvTransposeParam{  // Parameter block read by conv_transpose at buffer(0).
+  ushort kernelW;  // Filter width; bounds the w loop in conv_transpose.
+  ushort kernelH;  // Filter height; bounds the h loop in conv_transpose.
+  
+  ushort strideX;  // Divisor applied when mapping an output x back to an input x.
+  ushort strideY;  // Divisor applied when mapping an output y back to an input y.
+  
+  ushort paddingX;  // Added to the output x before dividing by strideX.
+  ushort paddingY;  // Added to the output y before dividing by strideY.
+  
+  ushort dilationX;  // Multiplies the filter column index w.
+  ushort dilationY;  // Multiplies the filter row index h.
+};
+
+// Transposed convolution, one output texel (four channels) per thread.
+//
+// For every filter tap (w, h) the output position is mapped back to the input position
+// that feeds it: in = (out + padding - tap * dilation) / stride. A tap contributes only
+// when that value is non-negative, exactly divisible by the stride, and inside the input.
+// Weights are float4 rows: for output slice z, four consecutive blocks of
+// kernelW * kernelH * input_slices rows hold the filters for components x, y, z and w.
+kernel void conv_transpose(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                           texture2d_array<float, access::write> outTexture [[texture(1)]],
+                           constant MetalConvTransposeParam &param [[buffer(0)]],
+                           const device float4 *weights [[buffer(1)]],
+                           uint3 gid [[thread_position_in_grid]]){
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  const int input_array_size = inTexture.get_array_size();
+  const int input_width = int(inTexture.get_width());
+  const int input_height = int(inTexture.get_height());
+
+  const uint kernel_one_output_slice = input_array_size * param.kernelW * param.kernelH;
+  const uint kernel_stride_z = gid.z * 4 * (kernel_one_output_slice);
+
+  // Signed copies: gid is unsigned, so mixing it into the subtraction below would wrap.
+  const int stride_x = int(param.strideX);
+  const int stride_y = int(param.strideY);
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  // Must start at zero; the accumulator was previously left uninitialised.
+  float4 output = float4(0.0);
+
+  for (int w = 0; w < param.kernelW; ++w) {
+    const int shifted_x = int(gid.x) + int(param.paddingX) - w * int(param.dilationX);
+    // Positions that are negative or not a multiple of the stride receive nothing from this tap.
+    if (shifted_x < 0 || shifted_x % stride_x != 0) {
+      continue;
+    }
+    const int input_x = shifted_x / stride_x;
+    if (input_x >= input_width) {
+      continue;
+    }
+
+    for (int h = 0; h < param.kernelH; ++h) {
+      const int shifted_y = int(gid.y) + int(param.paddingY) - h * int(param.dilationY);
+      if (shifted_y < 0 || shifted_y % stride_y != 0) {
+        continue;
+      }
+      const int input_y = shifted_y / stride_y;
+      if (input_y >= input_height) {
+        continue;
+      }
+
+      const uint kernel_index = (w * param.kernelH + h) * input_array_size;
+
+      for (int slice = 0; slice < input_array_size; ++slice) {
+        const uint row = kernel_stride_z + kernel_index + slice;
+        // Sample at (x, y); the y coordinate previously repeated input_x.
+        const float4 input = inTexture.sample(sample, float2(input_x, input_y), slice);
+        // One filter block per output component; all four previously summed into .x.
+        output.x += dot(input, weights[row + 0 * kernel_one_output_slice]);
+        output.y += dot(input, weights[row + 1 * kernel_one_output_slice]);
+        output.z += dot(input, weights[row + 2 * kernel_one_output_slice]);
+        output.w += dot(input, weights[row + 3 * kernel_one_output_slice]);
+      }
+    }
+  }
+
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+
+// conv
+#pragma mark -- conv
+// Plain 3x3 convolution across every input slice, no bias.
+// Weights are float4 rows. For output slice z the four component filters (x, y, z, w)
+// follow each other, each laid out as [tap 0...8][input slice].
+kernel void conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                     texture2d_array<float, access::write> outTexture [[texture(1)]],
+                     constant MetalConvParam &param [[buffer(0)]],
+                     const device float4 *weights [[buffer(1)]],
+                     uint3 gid [[thread_position_in_grid]]) {
+  const bool outOfRange = gid.x >= outTexture.get_width() ||
+                          gid.y >= outTexture.get_height() ||
+                          gid.z >= outTexture.get_array_size();
+  if (outOfRange) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9;
+  const uint input_arr_size = inTexture.get_array_size();
+  // Number of float4 rows in one component filter.
+  const uint filterStride = kernelHXW * input_arr_size;
+  const uint weithTo = gid.z * filterStride * 4;
+
+  const ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  float4 output = float4(0.0);
+  for (uint i = 0; i < input_arr_size; ++i) {
+    // Visit the 3x3 window row by row; j runs 0...8 in filter order.
+    uint j = 0;
+    for (int dy = -1; dy <= 1; ++dy) {
+      for (int dx = -1; dx <= 1; ++dx, ++j) {
+        const float4 texel = inTexture.sample(sample, float2(int(posInInput.x) + dx, int(posInInput.y) + dy), i);
+        const uint row = weithTo + j * input_arr_size + i;
+        output.x += dot(texel, weights[row + 0 * filterStride]);
+        output.y += dot(texel, weights[row + 1 * filterStride]);
+        output.z += dot(texel, weights[row + 2 * filterStride]);
+        output.w += dot(texel, weights[row + 3 * filterStride]);
+      }
+    }
+  }
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+// Depthwise 3x3 convolution, no bias: each output slice reads only the input slice with
+// the same index. `weights` holds four consecutive 9-tap filters per slice, one for each
+// of the x/y/z/w components.
+kernel void depthwise_conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                               texture2d_array<float, access::write> outTexture [[texture(1)]],
+                               constant MetalConvParam &param [[buffer(0)]],
+                               const device float *weights [[buffer(1)]],
+                               uint3 gid [[thread_position_in_grid]]) {
+  const bool outOfRange = gid.x >= outTexture.get_width() ||
+                          gid.y >= outTexture.get_height() ||
+                          gid.z >= outTexture.get_array_size();
+  if (outOfRange) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9;
+  const uint slice = gid.z;
+  const uint weithTo = slice * kernelHXW * 4;
+
+  const ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  float4 acc = float4(0.0);
+  // Walk the 3x3 window row by row; `tap` runs 0...8 in the same order as the filter.
+  uint tap = 0;
+  for (int dy = -1; dy <= 1; ++dy) {
+    for (int dx = -1; dx <= 1; ++dx, ++tap) {
+      const float4 texel = inTexture.sample(sample, float2(int(posInInput.x) + dx, int(posInInput.y) + dy), slice);
+      acc.x += texel.x * weights[weithTo + 0 * kernelHXW + tap];
+      acc.y += texel.y * weights[weithTo + 1 * kernelHXW + tap];
+      acc.z += texel.z * weights[weithTo + 2 * kernelHXW + tap];
+      acc.w += texel.w * weights[weithTo + 3 * kernelHXW + tap];
+    }
+  }
+  outTexture.write(acc, gid.xy, slice);
+}
+
+// 1x1 convolution across every input slice, no bias.
+// Weights are float4 rows. For output slice z the four component filters (x, y, z, w)
+// follow each other, each holding one row per input slice.
+kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                     texture2d_array<float, access::write> outTexture [[texture(1)]],
+                     constant MetalConvParam &param [[buffer(0)]],
+                     const device float4 *weights [[buffer(1)]],
+                     uint3 gid [[thread_position_in_grid]]) {
+  const bool outOfRange = gid.x >= outTexture.get_width() ||
+                          gid.y >= outTexture.get_height() ||
+                          gid.z >= outTexture.get_array_size();
+  if (outOfRange) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 1;
+  const uint input_arr_size = inTexture.get_array_size();
+  // Number of float4 rows in one component filter.
+  const uint filterStride = kernelHXW * input_arr_size;
+  const uint weithTo = gid.z * filterStride * 4;
+
+  const ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  float4 output = float4(0.0);
+  for (uint i = 0; i < input_arr_size; ++i) {
+    const float4 texel = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+    const uint row = weithTo + i;
+    output.x += dot(texel, weights[row + 0 * filterStride]);
+    output.y += dot(texel, weights[row + 1 * filterStride]);
+    output.z += dot(texel, weights[row + 2 * filterStride]);
+    output.w += dot(texel, weights[row + 3 * filterStride]);
+  }
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+#pragma mark - convAdd
 kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                         texture2d_array<float, access::write> outTexture [[texture(1)]],
                         constant MetalConvParam &param [[buffer(0)]],
@@ -357,7 +603,61 @@ kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[text
  outTexture.write(output, gid.xy, gid.z);
 }

-kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+// 3x3 convolution across every input slice, followed by a per-slice bias add.
+// Weights are float4 rows. For output slice z the four component filters (x, y, z, w)
+// follow each other, each laid out as [tap 0...8][input slice].
+// new_scale and new_biase are part of the signature but are not read by this kernel.
+kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         const device float4 *new_scale [[buffer(3)]],
+                         const device float4 *new_biase [[buffer(4)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+  const bool outOfRange = gid.x >= outTexture.get_width() ||
+                          gid.y >= outTexture.get_height() ||
+                          gid.z >= outTexture.get_array_size();
+  if (outOfRange) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9;
+  const uint input_arr_size = inTexture.get_array_size();
+  // Number of float4 rows in one component filter.
+  const uint filterStride = kernelHXW * input_arr_size;
+  const uint weithTo = gid.z * filterStride * 4;
+
+  const ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  float4 output = float4(0.0);
+  for (uint i = 0; i < input_arr_size; ++i) {
+    // Visit the 3x3 window row by row; j runs 0...8 in filter order.
+    uint j = 0;
+    for (int dy = -1; dy <= 1; ++dy) {
+      for (int dx = -1; dx <= 1; ++dx, ++j) {
+        const float4 texel = inTexture.sample(sample, float2(int(posInInput.x) + dx, int(posInInput.y) + dy), i);
+        const uint row = weithTo + j * input_arr_size + i;
+        output.x += dot(texel, weights[row + 0 * filterStride]);
+        output.y += dot(texel, weights[row + 1 * filterStride]);
+        output.z += dot(texel, weights[row + 2 * filterStride]);
+        output.w += dot(texel, weights[row + 3 * filterStride]);
+      }
+    }
+  }
+  output = output + biase[gid.z];
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                                                   texture2d_array<float, access::write> outTexture [[texture(1)]],
                                                   constant MetalConvParam &param [[buffer(0)]],
                                                   const device float *weights [[buffer(1)]],
@@ -395,6 +695,7 @@ kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access
    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
  }
-    output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  output = output + biase[gid.z];
  outTexture.write(output, gid.xy, gid.z);
 }
+
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernels.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernels.metal
@@ -36,18 +36,6 @@ kernel void resize(texture2d<half, access::read> inTexture [[texture(0)]],
    outTexture.write(half4(input.x, input.y, input.z, input.w), gid.xy, gid.z);
 }

-kernel void relu(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                 texture2d_array<half, access::write> outTexture [[texture(1)]],
-                 uint3 gid [[thread_position_in_grid]]) {
-    if (gid.x >= outTexture.get_width() ||
-        gid.y >= outTexture.get_height() ||
-        gid.z >= outTexture.get_array_size()) return;
-    constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero);
-    const half4 input = inTexture.read(gid.xy, gid.z);
-    const float4 relu = fmax((float4)input, 0.0);
-    outTexture.write(half4(relu), gid.xy, gid.z);
-}
-
 kernel void elementwise_add(texture2d_array<half, access::read> inTexture [[texture(0)]],
                            texture2d_array<half, access::write> outTexture [[texture(1)]],
                            const device half4 *biasTerms [[buffer(0)]],
@@ -60,18 +48,6 @@ kernel void elementwise_add(texture2d_array<half, access::read> inTexture [[text
    outTexture.write(input, gid.xy, gid.z);
 }

-kernel void batchnorm(texture2d_array<half, access::read> inTexture [[texture(0)]],
-                      texture2d_array<half, access::write> outTexture [[texture(1)]],
-                      const device half4 * newScale [[buffer(0)]],
-                      const device half4 * newBias [[buffer(1)]],
-                      uint3 gid [[thread_position_in_grid]]) {
-    if (gid.x >= outTexture.get_width() ||
-        gid.y >= outTexture.get_height() ||
-        gid.z >= outTexture.get_array_size()) return;
-    const half4 input = inTexture.read(gid.xy, gid.z);
-    half4 output = input * newScale[gid.z] + newBias[gid.z];
-    outTexture.write(output, gid.xy, gid.z);
-}

 //kernel void texture2d_to_2d_array(texture2d<half, access::read> inTexture [[texture(0)]],
 //                               texture2d_array<half, access::write> outTexture [[texture(1)]],
@@ -230,76 +206,6 @@ kernel void softmax_half(texture2d_array<half, access::read> inTexture [[texture
    outTexture.write(rr, gid.xy, gid.z);
 }

-kernel void prior_box(texture2d_array<float, access::read> inTexture [[texture(0)]],
-                      texture2d_array<float, access::write> outTexture [[texture(1)]],
-                      uint3 gid [[thread_position_in_grid]]) {
-    
-    int max_sizes_size;
-    float max_sizes[2];
-    
-    bool clip;
-    
-    float img_width;
-    float img_height;
-    
-    float step_width;
-    float step_height;
-    float offset;
-    
-    float aspect_ratios[2];
-    int aspect_ratios_size;
-    
-    float center_x = (gid.x + offset) * step_width;
-    float center_y = (gid.y + offset) * step_width;
-    
-    float box_width, box_height;
-    
-    int min_sizes_size;
-    float min_sizes[2];
-    
-    float min_size;
-    float max_size;
-    
-    if (gid.z < aspect_ratios_size) {
-        float ar = aspect_ratios[gid.z];
-        box_width = min_size * sqrt(ar) / 2;
-        box_height = min_size / sqrt(ar) / 2;
-        float4 box;
-        box.x = (center_x - box_width) / img_width;
-        box.y = (center_y - box_height) / img_height;
-        box.z = (center_x + box_width) / img_width;
-        box.w = (center_y + box_height) / img_height;
-        
-        float4 res;
-        if (clip) {
-            res = min(max(box, 0.0), 1.0);
-        } else {
-            res = box;
-        }
-    
-        outTexture.write(res, gid.xy, gid.z);
-    } else if (gid.z >= aspect_ratios_size) {
-        int max_index = gid.z - aspect_ratios_size;
-        if (max_sizes_size > 0 && min_sizes_size > 0) {
-            box_width = box_height = sqrt(min_size * max_size) / 2;
-            float4 max_box;
-            max_box.x = (center_x - box_width) / img_width;
-            max_box.y = (center_y - box_height) / img_height;
-            max_box.z = (center_x + box_width) / img_width;
-            max_box.w = (center_y + box_height) / img_height;
-            
-            float4 res;
-            if (clip) {
-                res = min(max(max_box, 0.0), 1.0);
-            } else {
-                res = max_box;
-            }
-            
-            outTexture.write(max_box, gid.xy, gid.z);
-        }
-    }
-}
-
 inline void xyzn2abcd(int C, int xyzn[4], int abcd[4]) {
    abcd[2] = xyzn[0];
    abcd[1] = xyzn[1];

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+
+
+// PReLU with one learned slope vector per array slice.
+//
+// inTexture  [[texture(0)]]: input activations.
+// outTexture [[texture(1)]]: result, written at the same (x, y, slice) the input is read from.
+// alpha      [[buffer(0)]]:  slopes indexed by gid.z; each float4 holds one slope per
+//                            texel component.
+//
+// Every component is passed through when positive and scaled by its slope otherwise.
+kernel void prelu_channel(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                          texture2d_array<float, access::write> outTexture [[texture(1)]],
+                          const device float4 *alpha [[buffer(0)]],
+                          uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to do.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  // Exact texel fetch at integer coordinates; no sampler is needed for this.
+  const float4 input = inTexture.read(gid.xy, gid.z);
+  const float4 slope = alpha[gid.z];
+
+  float4 output;
+  output.x = input.x > 0 ? input.x : slope.x * input.x;
+  output.y = input.y > 0 ? input.y : slope.y * input.y;
+  output.z = input.z > 0 ? input.z : slope.z * input.z;
+  output.w = input.w > 0 ? input.w : slope.w * input.w;
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+
+// PReLU with one learned slope vector per texel.
+//
+// inTexture  [[texture(0)]]: input activations.
+// outTexture [[texture(1)]]: result, written at the same (x, y, slice) the input is read from.
+// alpha      [[buffer(0)]]:  slopes indexed by
+//                            (y * width + x) * arraySize + slice; each float4 holds one
+//                            slope per texel component.
+//
+// Every component is passed through when positive and scaled by its slope otherwise.
+kernel void prelu_element(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                          texture2d_array<float, access::write> outTexture [[texture(1)]],
+                          const device float4 *alpha [[buffer(0)]],
+                          uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to do.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  // Exact texel fetch at integer coordinates; no sampler is needed for this.
+  const float4 input = inTexture.read(gid.xy, gid.z);
+
+  // All slices of one pixel are contiguous in the slope buffer.
+  const uint alpha_index =
+      (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size() + gid.z;
+  const float4 slope = alpha[alpha_index];
+
+  float4 output;
+  output.x = input.x > 0 ? input.x : slope.x * input.x;
+  output.y = input.y > 0 ? input.y : slope.y * input.y;
+  output.z = input.z > 0 ? input.z : slope.z * input.z;
+  output.w = input.w > 0 ? input.w : slope.w * input.w;
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+
+// PReLU with a single learned slope shared by every value.
+//
+// inTexture  [[texture(0)]]: input activations.
+// outTexture [[texture(1)]]: result, written at the same (x, y, slice) the input is read from.
+// alpha      [[buffer(0)]]:  only element 0 is read; it is the shared slope.
+//
+// Every component is passed through when positive and scaled by the slope otherwise.
+kernel void prelu_other(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                        texture2d_array<float, access::write> outTexture [[texture(1)]],
+                        const device float *alpha [[buffer(0)]],
+                        uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to do.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  // Exact texel fetch at integer coordinates; no sampler is needed for this.
+  const float4 input = inTexture.read(gid.xy, gid.z);
+  const float slope = alpha[0];
+
+  float4 output;
+  output.x = input.x > 0 ? input.x : slope * input.x;
+  output.y = input.y > 0 ? input.y : slope * input.y;
+  output.z = input.z > 0 ? input.z : slope * input.z;
+  output.w = input.w > 0 ? input.w : slope * input.w;
+  outTexture.write(output, gid.xy, gid.z);
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+// Scalar parameters for the prior_box kernel, bound as a constant buffer.
+// NOTE(review): the field order and types presumably have to match the struct the
+// host side fills in — confirm before reordering or retyping anything here.
+struct PriorBoxMetalParam {
+  // Added to the thread's x / y grid position before scaling by the step.
+  float offset;
+  // Multiplier turning (gid.x + offset) into the box centre x.
+  float stepWidth;
+  // Multiplier turning (gid.y + offset) into the box centre y.
+  float stepHeight;
+  // Base size used for the aspect-ratio boxes and, with maxSize, for the square box.
+  float minSize;
+  // Combined with minSize as sqrt(minSize * maxSize) for the square box.
+  float maxSize;
+  // Divisors that normalise box x coordinates and y coordinates respectively.
+  float imageWidth;
+  float imageHeight;
+  
+  // When true, box coordinates are clamped to [0, 1].
+  bool clip;
+  
+  // Slices with gid.z below this value take part in variance handling.
+  uint numPriors;
+  // Number of entries read from the aspect_ratios buffer (one per leading slice).
+  uint aspecRatiosSize;
+  // Not read by the prior_box kernel in this file.
+  uint minSizeSize;
+  // The square min/max box is only produced when this is greater than zero.
+  uint maxSizeSize;
+};
+
+// Computes one prior box per (x, y, slice) thread and stores its variance.
+//
+// inTexture       [[texture(0)]]: bound but not read by this kernel.
+// outBoxTexture   [[texture(1)]]: receives (xmin, ymin, xmax, ymax), normalised by the
+//                                 image size.
+// varianceTexture [[texture(2)]]: receives the variance vector for each prior.
+// param           [[buffer(0)]]:  scalar settings, see PriorBoxMetalParam.
+// aspect_ratios   [[buffer(1)]]:  one ratio per slice below param.aspecRatiosSize.
+// variances       [[buffer(2)]]:  only element 0 is read.
+//
+// Slices below param.aspecRatiosSize get a box of area minSize^2 stretched by the
+// slice's aspect ratio. The remaining slices get a square box of side
+// sqrt(minSize * maxSize), but only when param.maxSizeSize > 0.
+kernel void prior_box(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                      texture2d_array<float, access::write> outBoxTexture [[texture(1)]],
+                      texture2d_array<float, access::write> varianceTexture [[texture(2)]],
+                      constant PriorBoxMetalParam &param [[buffer(0)]],
+                      const device float *aspect_ratios [[buffer(1)]],
+                      const device float4 *variances [[buffer(2)]],
+                      uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the box texture have nothing to do.
+  if (gid.x >= outBoxTexture.get_width() ||
+      gid.y >= outBoxTexture.get_height() ||
+      gid.z >= outBoxTexture.get_array_size()) return;
+
+  const float center_x = (gid.x + param.offset) * param.stepWidth;
+  const float center_y = (gid.y + param.offset) * param.stepHeight;
+
+  // Half extents of the box for this slice; has_box stays false when the slice
+  // produces no box at all.
+  float box_width = 0.0;
+  float box_height = 0.0;
+  bool has_box = false;
+
+  if (gid.z < param.aspecRatiosSize) {
+    const float ar_root = sqrt(aspect_ratios[gid.z]);
+    box_width = param.minSize * ar_root / 2;
+    box_height = param.minSize / ar_root / 2;
+    has_box = true;
+  } else if (param.maxSizeSize > 0) {
+    box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
+    has_box = true;
+  }
+
+  if (has_box) {
+    float4 box;
+    box.x = (center_x - box_width) / param.imageWidth;
+    box.y = (center_y - box_height) / param.imageHeight;
+    box.z = (center_x + box_width) / param.imageWidth;
+    box.w = (center_y + box_height) / param.imageHeight;
+
+    // Clipping applies to every kind of box, including the min/max square one.
+    const float4 res = param.clip ? min(max(box, 0.0), 1.0) : box;
+    outBoxTexture.write(res, gid.xy, gid.z);
+  }
+
+  // Every prior shares the same variance vector.
+  // NOTE(review): assumes varianceTexture uses the same (x, y, slice) addressing as
+  // outBoxTexture — confirm against the host-side texture setup.
+  if (gid.z < param.numPriors &&
+      gid.x < varianceTexture.get_width() &&
+      gid.y < varianceTexture.get_height() &&
+      gid.z < varianceTexture.get_array_size()) {
+    varianceTexture.write(variances[0], gid.xy, gid.z);
+  }
+}
+
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReluKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+
+// ReLU over a half-precision texture array: each component becomes max(value, 0).
+//
+// inTexture  [[texture(0)]]: input activations.
+// outTexture [[texture(1)]]: result, written at the same (x, y, slice) the input is read from.
+kernel void relu_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                      texture2d_array<half, access::write> outTexture [[texture(1)]],
+                      uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to do.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+
+  const half4 input = inTexture.read(gid.xy, gid.z);
+  // The maximum is taken in float and the result narrowed back to half.
+  const float4 relu = fmax((float4)input, 0.0);
+  outTexture.write(half4(relu), gid.xy, gid.z);
+}
+
+// ReLU over a float texture array: each component becomes max(value, 0).
+//
+// inTexture  [[texture(0)]]: input activations.
+// outTexture [[texture(1)]]: result, written at the same (x, y, slice) the input is read from.
+kernel void relu(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                 texture2d_array<float, access::write> outTexture [[texture(1)]],
+                 uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to do.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+
+  const float4 input = inTexture.read(gid.xy, gid.z);
+  const float4 relu = fmax(input, 0.0);
+  outTexture.write(relu, gid.xy, gid.z);
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift
+///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. */
+
+
+import Foundation
+
+/// Inputs, outputs and attributes of the prelu operator, resolved from an op
+/// description and the scope that holds its variables.
+class PreluParam<P: PrecisionType>: OpParam {
+  typealias ParamPrecisionType = P
+
+  /// Value of the op's "mode" attribute.
+  let mode: String
+  /// The op's alpha input.
+  let alpha: Tensor<P>
+  /// The op's X input.
+  let input: Texture<P>
+  /// The op's Out output.
+  var output: Texture<P>
+
+  /// Looks up every input, output and attribute the operator needs.
+  ///
+  /// - Throws: Whatever error the first failing lookup throws; lookups run in
+  ///   the order input, output, alpha, mode.
+  required init(opDesc: OpDesc, inScope: Scope) throws {
+    input = try PreluParam.inputX(inputs: opDesc.inputs, from: inScope)
+    output = try PreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+    alpha = try PreluParam.inputAlpha(inputs: opDesc.inputs, from: inScope)
+    mode = try PreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
+  }
+}
+
+/// The prelu operator: forwards its parameters to `PreluKernel` for execution.
+class PreluOp<P: PrecisionType>: Operator<PreluKernel<P>, PreluParam<P>>, Runable, Creator, InferShaperable {
+
+  typealias OpType = PreluOp<P>
+
+  /// Intentionally does nothing; the output shape is not derived here.
+  func inferShape() {
+    // para.output.dim = para.input.dim
+  }
+
+  /// Encodes the kernel for this operator into `buffer`.
+  ///
+  /// - Throws: Any error raised by the kernel's `compute`.
+  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+    try kernel.compute(commandBuffer: buffer, param: para)
+  }
+
+  /// Prints the input and output textures for debugging.
+  func delogOutput() {
+    print("prelu delog")
+    let _: P? = para.input.metalTexture.logDesc(header: "prelu input: ", stridable: false)
+    let _: P? = para.output.metalTexture.logDesc(header: "prelu output: ", stridable: false)
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
@@ -19,27 +19,49 @@ class PriorBoxParam<P: PrecisionType>: OpParam {
    required init(opDesc: OpDesc, inScope: Scope) throws {
        do {
            input = try PriorBoxParam.input(inputs: opDesc.inputs, from: inScope)
-            output = try PriorBoxParam.getFirstTensor(key: "Boxes", map: opDesc.outputs, from: inScope)
-            variances = try PriorBoxParam.getFirstTensor(key: "Variances", map: opDesc.outputs, from: inScope)
+            output = try PriorBoxParam.outputBoxes(outputs: opDesc.outputs, from: inScope)
+            inputImage = try PriorBoxParam.inputImage(inputs: opDesc.inputs, from: inScope)
+            outputVariances = try PriorBoxParam.outputVariances(outputs: opDesc.outputs, from: inScope)
+            minSizes = try PriorBoxParam.getAttr(key: "min_sizes", attrs: opDesc.attrs)
+            maxSizes = try PriorBoxParam.getAttr(key: "max_sizes", attrs: opDesc.attrs)
+            aspectRatios = try PriorBoxParam.getAttr(key: "aspect_ratios", attrs: opDesc.attrs)
+            variances = try PriorBoxParam.getAttr(key: "variances", attrs: opDesc.attrs)
+            flip = try PriorBoxParam.getAttr(key: "flip", attrs: opDesc.attrs)
+            clip = try PriorBoxParam.getAttr(key: "clip", attrs: opDesc.attrs)
+            stepW = try PriorBoxParam.getAttr(key: "step_w", attrs: opDesc.attrs)
+            stepH = try PriorBoxParam.getAttr(key: "step_h", attrs: opDesc.attrs)
+            offset = try PriorBoxParam.getAttr(key: "offset", attrs: opDesc.attrs)
        } catch let error {
            throw error
        }
    }
+    
+    let minSizes: [Float32]
+    let maxSizes: [Float32]
+    let aspectRatios: [Float32]
+    var newAspectRatios: [Float32]?
+    let variances: [Float32]
+    let flip: Bool
+    let clip: Bool
+    var stepW: Float32
+    var stepH: Float32
+    let offset: Float32
+    
    let input: Texture<P>
+    let inputImage: Texture<P>
    var output: Texture<P>
-    let variances: Texture<P>
+    let outputVariances: Texture<P>
 }

 class PriorBoxOp<P: PrecisionType>: Operator<PriorBoxKernel<P>, PriorBoxParam<P>>, Runable, Creator, InferShaperable{
    
    func inferShape() {
-        para.output.dim = para.input.dim
    }
    
    typealias OpType = PriorBoxOp<P>
    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
        do {
-            // try kernel.compute(commandBuffer: buffer, param: para)
+             try kernel.compute(commandBuffer: buffer, param: para)
        } catch let error {
            throw error
        }

--- a/metal/paddle-mobile/paddle-mobile/framework/Dim.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Dim.swift
@@ -31,11 +31,11 @@ public struct Dim {
        return dims.reduce(1) { $0 * $1 }
    }
    
-    static func ==(left: Dim, right: Dim) -> Bool {
+    public static func ==(left: Dim, right: Dim) -> Bool {
        return left.dims == right.dims;
    }
    
-    subscript(index: Int) -> Int {
+    public subscript(index: Int) -> Int {
        return dims[index];
    }
    

--- a/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
@@ -41,15 +41,15 @@ extension InputTexture {
 public class Texture<P: PrecisionType>: Tensorial {
    var dim: Dim
    var tensorDim: Dim
-    private(set) var originDim: Dim
+    private(set) public var originDim: Dim
    private var textureDesc: MTLTextureDescriptor!
-    var metalTexture: MTLTexture!
+    public var metalTexture: MTLTexture!
    var transpose: [Int] = [0, 1, 2, 3]
    
    func initTexture(device: MTLDevice, transpose: [Int] = [0, 1, 2, 3]) {
        let newDim = transpose.map { originDim[$0] }
        
-        let newLayout = transpose.map {layout.layoutWithDim[$0] }
+        let newLayout = transpose.map { layout.layoutWithDim[$0] }
        
        layout = DataLayout.init(newLayout)
        dim = Dim.init(inDim: newDim)