Unverified commit 13f2079a. Author: Jiaying Zhao. Committer: GitHub.

Merge branch 'develop' into develop

......@@ -24,6 +24,7 @@
*.lai
*.la
*.lib
*.a
# Executables
*.exe
......@@ -70,7 +71,10 @@ build
cmake-build-debug
cmake-build-release
#ios demo
# ios
tools/libomp.a
# ios demo
demo/ios/PaddleMobileDemo/PaddleMobileDemo/googlenet_combine/
demo/ios/PaddleMobileDemo/PaddleMobileDemo/*.jpg
demo/ios/PaddleMobileDemo/PaddleMobileDemo/PaddleMobile/*.a
......@@ -84,6 +88,7 @@ SwiftProtobuf.framework
paddle-mobile.xcworkspace
metal/models/
metal/images/
tools/libomp.a
\ No newline at end of file
*.a
metal/paddle-mobile/paddle-mobile/CPU/libpaddle-mobile.a
*.xcuserdatad/
*/xcuserdata/
......@@ -69,8 +69,18 @@ Paddle-Mobile is a project under the PaddlePaddle organization, dedicated to embedded platforms
- **GPU Metal implementation for Apple devices**
A Metal-based GPU inference library for Apple devices is also under development, and a runnable version will be available soon (a usage sketch follows this list).

|mobilenetfssd|Speed|
|-------------|-----|
|A9(ms)|33.78|
|A10(ms)|24.05|
|A11(ms)|17.15|

|genet|Speed|
|-----|-----|
|A9(ms)|3.49|
|A10(ms)|2.54|
|A11(ms)|1.43|

- **FPGA**
The FPGA implementation is in progress, targeting the Xilinx ZU5 development board.
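For orientation, here is a minimal Swift sketch of driving the Metal GPU library, assuming the Runner / Genet / MetalHelper API that appears in the demo sources added by this commit; the exact entry points and completion types may differ from the released paddle_mobile API.

```swift
import UIKit
import paddle_mobile

// Illustrative only: class and method names are taken from the demo code in this commit.
let net = Genet.init(device: MetalHelper.shared.device)
let runner = Runner.init(inNet: net, commandQueue: MetalHelper.shared.queue, inPlatform: .GPU)

if runner.load() {
    // banana.jpeg is one of the images bundled with the demo.
    if let image = UIImage(named: "banana.jpeg")?.cgImage {
        runner.predict(cgImage: image) { (success, results) in
            if success {
                print(results)
            }
        }
    }
} else {
    print("load failed")
}
```

In the demo itself, loading and prediction are wired to view-controller actions (see MultiPredictViewController and LoadPointerViewController below) rather than run at launch.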
......
......@@ -8,22 +8,29 @@
/* Begin PBXBuildFile section */
30D0ED21F392CFA3885B1002 /* Pods_paddle_mobile_demo.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 18896810981724F8A0FED62A /* Pods_paddle_mobile_demo.framework */; };
C2E67E5E21524E460013F575 /* LoadPointerViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = C2E67E5D21524E460013F575 /* LoadPointerViewController.m */; };
FC013928210204A3008100E3 /* PreProcessKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC013927210204A3008100E3 /* PreProcessKernel.metal */; };
FC039B8220E11C550081E9F8 /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B8120E11C550081E9F8 /* AppDelegate.swift */; };
FC039B8420E11C550081E9F8 /* ViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B8320E11C550081E9F8 /* ViewController.swift */; };
FC039B8720E11C550081E9F8 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC039B8520E11C550081E9F8 /* Main.storyboard */; };
FC039B8920E11C560081E9F8 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = FC039B8820E11C560081E9F8 /* Assets.xcassets */; };
FC039B8C20E11C560081E9F8 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC039B8A20E11C560081E9F8 /* LaunchScreen.storyboard */; };
FC3602C82108580600FACB58 /* MetalHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC3602C72108580600FACB58 /* MetalHelper.swift */; };
FC918191211DBC3500B6F354 /* paddle-mobile.png in Resources */ = {isa = PBXBuildFile; fileRef = FC918190211DBC3500B6F354 /* paddle-mobile.png */; };
FC918193211DC70500B6F354 /* iphone.JPG in Resources */ = {isa = PBXBuildFile; fileRef = FC918192211DC70500B6F354 /* iphone.JPG */; };
FCD04E6320F3146B0007374F /* params in Resources */ = {isa = PBXBuildFile; fileRef = FCD04E6120F3146A0007374F /* params */; };
FCD04E6420F3146B0007374F /* model in Resources */ = {isa = PBXBuildFile; fileRef = FCD04E6220F3146A0007374F /* model */; };
FCDFD3FB211D72C3005AB38B /* ModelHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDFD3FA211D72C3005AB38B /* ModelHelper.swift */; };
FCDFD41B211D91C7005AB38B /* synset.txt in Resources */ = {isa = PBXBuildFile; fileRef = FCDFD41A211D91C7005AB38B /* synset.txt */; };
FC803BCD214D27930094B8E5 /* FPSCounter.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BCB214D27920094B8E5 /* FPSCounter.swift */; };
FC803BCE214D27930094B8E5 /* VideoCapture.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BCC214D27920094B8E5 /* VideoCapture.swift */; };
FCBCCC552122EF5500D94F7E /* MetalHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC542122EF5400D94F7E /* MetalHelper.swift */; };
FCEBEC2C20E1391F00C0B14D /* paddle_mobile.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */; };
FCEBEC2D20E1391F00C0B14D /* paddle_mobile.framework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; };
FCEEE7D4210627A000444BEC /* banana.jpeg in Resources */ = {isa = PBXBuildFile; fileRef = FCEEE7D3210627A000444BEC /* banana.jpeg */; };
FCF437E8214B6DDB00943429 /* MultiPredictViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCF437E7214B6DDB00943429 /* MultiPredictViewController.swift */; };
FCFE9B692152858600DECA15 /* hand.jpg.zip in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B632152858600DECA15 /* hand.jpg.zip */; };
FCFE9B6A2152858600DECA15 /* synset.txt in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B642152858600DECA15 /* synset.txt */; };
FCFE9B6B2152858600DECA15 /* banana.jpeg in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B652152858600DECA15 /* banana.jpeg */; };
FCFE9B6C2152858600DECA15 /* hand.jpg in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B662152858600DECA15 /* hand.jpg */; };
FCFE9B6D2152858600DECA15 /* iphone.JPG in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B672152858600DECA15 /* iphone.JPG */; };
FCFE9B6E2152858600DECA15 /* paddle-mobile.png in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B682152858600DECA15 /* paddle-mobile.png */; };
FCFE9C512152859600DECA15 /* genet_params in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B752152859500DECA15 /* genet_params */; };
FCFE9C522152859600DECA15 /* genet_model in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B762152859500DECA15 /* genet_model */; };
FCFE9D232152859600DECA15 /* ar_model in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9C4C2152859500DECA15 /* ar_model */; };
FCFE9D242152859600DECA15 /* ar_params in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9C4D2152859500DECA15 /* ar_params */; };
/* End PBXBuildFile section */
/* Begin PBXCopyFilesBuildPhase section */
......@@ -44,6 +51,8 @@
081C9CF10DB06C58B8B6B039 /* Pods-paddle-mobile-demo.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-demo.release.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-demo/Pods-paddle-mobile-demo.release.xcconfig"; sourceTree = "<group>"; };
18896810981724F8A0FED62A /* Pods_paddle_mobile_demo.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_paddle_mobile_demo.framework; sourceTree = BUILT_PRODUCTS_DIR; };
878829884E1A14D7044721D5 /* Pods-paddle-mobile-demo.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-demo.debug.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-demo/Pods-paddle-mobile-demo.debug.xcconfig"; sourceTree = "<group>"; };
C2E67E5C21524E460013F575 /* LoadPointerViewController.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = LoadPointerViewController.h; sourceTree = "<group>"; };
C2E67E5D21524E460013F575 /* LoadPointerViewController.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = LoadPointerViewController.m; sourceTree = "<group>"; };
FC013927210204A3008100E3 /* PreProcessKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PreProcessKernel.metal; sourceTree = "<group>"; };
FC039B7E20E11C550081E9F8 /* paddle-mobile-demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = "paddle-mobile-demo.app"; sourceTree = BUILT_PRODUCTS_DIR; };
FC039B8120E11C550081E9F8 /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = "<group>"; };
......@@ -52,15 +61,23 @@
FC039B8820E11C560081E9F8 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
FC039B8B20E11C560081E9F8 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = "<group>"; };
FC039B8D20E11C560081E9F8 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
FC3602C72108580600FACB58 /* MetalHelper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; name = MetalHelper.swift; path = "../../paddle-mobile-unit-test/paddle-mobile-unit-test/MetalHelper.swift"; sourceTree = "<group>"; };
FC918190211DBC3500B6F354 /* paddle-mobile.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; path = "paddle-mobile.png"; sourceTree = "<group>"; };
FC918192211DC70500B6F354 /* iphone.JPG */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = iphone.JPG; sourceTree = "<group>"; };
FCD04E6120F3146A0007374F /* params */ = {isa = PBXFileReference; lastKnownFileType = file; path = params; sourceTree = "<group>"; };
FCD04E6220F3146A0007374F /* model */ = {isa = PBXFileReference; lastKnownFileType = file; path = model; sourceTree = "<group>"; };
FCDFD3FA211D72C3005AB38B /* ModelHelper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ModelHelper.swift; sourceTree = "<group>"; };
FCDFD41A211D91C7005AB38B /* synset.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = synset.txt; sourceTree = "<group>"; };
FC27991121343A39000B6BAD /* paddle-mobile-demo-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "paddle-mobile-demo-Bridging-Header.h"; sourceTree = "<group>"; };
FC4FD97B2140EE250073E130 /* libc++.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = "libc++.tbd"; path = "usr/lib/libc++.tbd"; sourceTree = SDKROOT; };
FC803BCB214D27920094B8E5 /* FPSCounter.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FPSCounter.swift; sourceTree = "<group>"; };
FC803BCC214D27920094B8E5 /* VideoCapture.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = VideoCapture.swift; sourceTree = "<group>"; };
FCBCCC542122EF5400D94F7E /* MetalHelper.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MetalHelper.swift; sourceTree = "<group>"; };
FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; path = paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; };
FCEEE7D3210627A000444BEC /* banana.jpeg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = banana.jpeg; sourceTree = "<group>"; };
FCF437E7214B6DDB00943429 /* MultiPredictViewController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MultiPredictViewController.swift; sourceTree = "<group>"; };
FCFE9B632152858600DECA15 /* hand.jpg.zip */ = {isa = PBXFileReference; lastKnownFileType = archive.zip; path = hand.jpg.zip; sourceTree = "<group>"; };
FCFE9B642152858600DECA15 /* synset.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = synset.txt; sourceTree = "<group>"; };
FCFE9B652152858600DECA15 /* banana.jpeg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = banana.jpeg; sourceTree = "<group>"; };
FCFE9B662152858600DECA15 /* hand.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = hand.jpg; sourceTree = "<group>"; };
FCFE9B672152858600DECA15 /* iphone.JPG */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = iphone.JPG; sourceTree = "<group>"; };
FCFE9B682152858600DECA15 /* paddle-mobile.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; path = "paddle-mobile.png"; sourceTree = "<group>"; };
FCFE9B752152859500DECA15 /* genet_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = genet_params; sourceTree = "<group>"; };
FCFE9B762152859500DECA15 /* genet_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = genet_model; sourceTree = "<group>"; };
FCFE9C4C2152859500DECA15 /* ar_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = ar_model; sourceTree = "<group>"; };
FCFE9C4D2152859500DECA15 /* ar_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = ar_params; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
......@@ -88,6 +105,7 @@
7B7DED984E9EE7BFB45E24E8 /* Frameworks */ = {
isa = PBXGroup;
children = (
FC4FD97B2140EE250073E130 /* libc++.tbd */,
18896810981724F8A0FED62A /* Pods_paddle_mobile_demo.framework */,
);
name = Frameworks;
......@@ -115,49 +133,82 @@
FC039B8020E11C550081E9F8 /* paddle-mobile-demo */ = {
isa = PBXGroup;
children = (
FC0E2C2020EDC03B009C1FAC /* models */,
FC0E2C1D20EDC030009C1FAC /* images */,
FCFE9B6F2152859500DECA15 /* models */,
FCFE9B622152858600DECA15 /* images */,
FC803BCA214D27920094B8E5 /* VideoCapture */,
FC8CFED2213519540094D569 /* Net */,
FC039B8120E11C550081E9F8 /* AppDelegate.swift */,
FC013927210204A3008100E3 /* PreProcessKernel.metal */,
FC039B8320E11C550081E9F8 /* ViewController.swift */,
FC039B8520E11C550081E9F8 /* Main.storyboard */,
FC039B8820E11C560081E9F8 /* Assets.xcassets */,
FC039B8A20E11C560081E9F8 /* LaunchScreen.storyboard */,
FC039B8D20E11C560081E9F8 /* Info.plist */,
FC3602C72108580600FACB58 /* MetalHelper.swift */,
FCDFD3FA211D72C3005AB38B /* ModelHelper.swift */,
FC27991121343A39000B6BAD /* paddle-mobile-demo-Bridging-Header.h */,
FCF437E7214B6DDB00943429 /* MultiPredictViewController.swift */,
C2E67E5C21524E460013F575 /* LoadPointerViewController.h */,
C2E67E5D21524E460013F575 /* LoadPointerViewController.m */,
);
path = "paddle-mobile-demo";
sourceTree = "<group>";
};
FC0E2C1D20EDC030009C1FAC /* images */ = {
FC803BCA214D27920094B8E5 /* VideoCapture */ = {
isa = PBXGroup;
children = (
FC803BCB214D27920094B8E5 /* FPSCounter.swift */,
FC803BCC214D27920094B8E5 /* VideoCapture.swift */,
);
path = VideoCapture;
sourceTree = "<group>";
};
FC8CFED2213519540094D569 /* Net */ = {
isa = PBXGroup;
children = (
FC013927210204A3008100E3 /* PreProcessKernel.metal */,
FCBCCC542122EF5400D94F7E /* MetalHelper.swift */,
);
path = Net;
sourceTree = "<group>";
};
FCFE9B622152858600DECA15 /* images */ = {
isa = PBXGroup;
children = (
FC918192211DC70500B6F354 /* iphone.JPG */,
FC918190211DBC3500B6F354 /* paddle-mobile.png */,
FCDFD41A211D91C7005AB38B /* synset.txt */,
FCEEE7D3210627A000444BEC /* banana.jpeg */,
FCFE9B632152858600DECA15 /* hand.jpg.zip */,
FCFE9B642152858600DECA15 /* synset.txt */,
FCFE9B652152858600DECA15 /* banana.jpeg */,
FCFE9B662152858600DECA15 /* hand.jpg */,
FCFE9B672152858600DECA15 /* iphone.JPG */,
FCFE9B682152858600DECA15 /* paddle-mobile.png */,
);
name = images;
path = ../../images;
sourceTree = "<group>";
};
FC0E2C2020EDC03B009C1FAC /* models */ = {
FCFE9B6F2152859500DECA15 /* models */ = {
isa = PBXGroup;
children = (
FCD04E6020F3146A0007374F /* mobilenet */,
FCFE9B742152859500DECA15 /* genet */,
FCFE9C4B2152859500DECA15 /* fluid_fssd_new_ar */,
);
name = models;
path = ../../models;
sourceTree = "<group>";
};
FCD04E6020F3146A0007374F /* mobilenet */ = {
FCFE9B742152859500DECA15 /* genet */ = {
isa = PBXGroup;
children = (
FCFE9B752152859500DECA15 /* genet_params */,
FCFE9B762152859500DECA15 /* genet_model */,
);
path = genet;
sourceTree = "<group>";
};
FCFE9C4B2152859500DECA15 /* fluid_fssd_new_ar */ = {
isa = PBXGroup;
children = (
FCD04E6120F3146A0007374F /* params */,
FCD04E6220F3146A0007374F /* model */,
FCFE9C4C2152859500DECA15 /* ar_model */,
FCFE9C4D2152859500DECA15 /* ar_params */,
);
path = mobilenet;
path = fluid_fssd_new_ar;
sourceTree = "<group>";
};
/* End PBXGroup section */
......@@ -195,6 +246,7 @@
TargetAttributes = {
FC039B7D20E11C550081E9F8 = {
CreatedOnToolsVersion = 9.3.1;
LastSwiftMigration = 0940;
};
};
};
......@@ -221,14 +273,18 @@
isa = PBXResourcesBuildPhase;
buildActionMask = 2147483647;
files = (
FCD04E6320F3146B0007374F /* params in Resources */,
FCFE9D232152859600DECA15 /* ar_model in Resources */,
FC039B8C20E11C560081E9F8 /* LaunchScreen.storyboard in Resources */,
FC918191211DBC3500B6F354 /* paddle-mobile.png in Resources */,
FCFE9C522152859600DECA15 /* genet_model in Resources */,
FCFE9D242152859600DECA15 /* ar_params in Resources */,
FCFE9B6E2152858600DECA15 /* paddle-mobile.png in Resources */,
FCFE9C512152859600DECA15 /* genet_params in Resources */,
FCFE9B692152858600DECA15 /* hand.jpg.zip in Resources */,
FC039B8920E11C560081E9F8 /* Assets.xcassets in Resources */,
FCEEE7D4210627A000444BEC /* banana.jpeg in Resources */,
FC918193211DC70500B6F354 /* iphone.JPG in Resources */,
FCDFD41B211D91C7005AB38B /* synset.txt in Resources */,
FCD04E6420F3146B0007374F /* model in Resources */,
FCFE9B6A2152858600DECA15 /* synset.txt in Resources */,
FCFE9B6B2152858600DECA15 /* banana.jpeg in Resources */,
FCFE9B6D2152858600DECA15 /* iphone.JPG in Resources */,
FCFE9B6C2152858600DECA15 /* hand.jpg in Resources */,
FC039B8720E11C550081E9F8 /* Main.storyboard in Resources */,
);
runOnlyForDeploymentPostprocessing = 0;
......@@ -280,10 +336,13 @@
buildActionMask = 2147483647;
files = (
FC039B8420E11C550081E9F8 /* ViewController.swift in Sources */,
FCDFD3FB211D72C3005AB38B /* ModelHelper.swift in Sources */,
FC803BCE214D27930094B8E5 /* VideoCapture.swift in Sources */,
FC013928210204A3008100E3 /* PreProcessKernel.metal in Sources */,
FCF437E8214B6DDB00943429 /* MultiPredictViewController.swift in Sources */,
FCBCCC552122EF5500D94F7E /* MetalHelper.swift in Sources */,
FC803BCD214D27930094B8E5 /* FPSCounter.swift in Sources */,
C2E67E5E21524E460013F575 /* LoadPointerViewController.m in Sources */,
FC039B8220E11C550081E9F8 /* AppDelegate.swift in Sources */,
FC3602C82108580600FACB58 /* MetalHelper.swift in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
......@@ -428,19 +487,23 @@
baseConfigurationReference = 878829884E1A14D7044721D5 /* Pods-paddle-mobile-demo.debug.xcconfig */;
buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
CLANG_ENABLE_MODULES = YES;
CODE_SIGN_IDENTITY = "iPhone Developer";
CODE_SIGN_STYLE = Automatic;
DEVELOPMENT_TEAM = A798K58VVL;
ENABLE_BITCODE = NO;
INFOPLIST_FILE = "paddle-mobile-demo/Info.plist";
IPHONEOS_DEPLOYMENT_TARGET = 9.0;
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
);
PRODUCT_BUNDLE_IDENTIFIER = com.paddlemobile.metal;
PRODUCT_BUNDLE_IDENTIFIER = "com.baidu.paddle-mobile";
PRODUCT_NAME = "$(TARGET_NAME)";
PROVISIONING_PROFILE = "";
PROVISIONING_PROFILE_SPECIFIER = "";
SWIFT_OBJC_BRIDGING_HEADER = "paddle-mobile-demo/paddle-mobile-demo-Bridging-Header.h";
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
SWIFT_VERSION = 4.0;
TARGETED_DEVICE_FAMILY = "1,2";
};
......@@ -451,19 +514,22 @@
baseConfigurationReference = 081C9CF10DB06C58B8B6B039 /* Pods-paddle-mobile-demo.release.xcconfig */;
buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
CLANG_ENABLE_MODULES = YES;
CODE_SIGN_IDENTITY = "iPhone Developer";
CODE_SIGN_STYLE = Automatic;
DEVELOPMENT_TEAM = A798K58VVL;
ENABLE_BITCODE = NO;
INFOPLIST_FILE = "paddle-mobile-demo/Info.plist";
IPHONEOS_DEPLOYMENT_TARGET = 9.0;
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
);
PRODUCT_BUNDLE_IDENTIFIER = com.paddlemobile.metal;
PRODUCT_BUNDLE_IDENTIFIER = "com.baidu.paddle-mobile";
PRODUCT_NAME = "$(TARGET_NAME)";
PROVISIONING_PROFILE = "";
PROVISIONING_PROFILE_SPECIFIER = "";
SWIFT_OBJC_BRIDGING_HEADER = "paddle-mobile-demo/paddle-mobile-demo-Bridging-Header.h";
SWIFT_VERSION = 4.0;
TARGETED_DEVICE_FAMILY = "1,2";
};
......
<?xml version="1.0" encoding="UTF-8"?>
<Scheme
LastUpgradeVersion = "0940"
version = "1.3">
<BuildAction
parallelizeBuildables = "YES"
buildImplicitDependencies = "YES">
<BuildActionEntries>
<BuildActionEntry
buildForTesting = "YES"
buildForRunning = "YES"
buildForProfiling = "YES"
buildForArchiving = "YES"
buildForAnalyzing = "YES">
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "FC039B7D20E11C550081E9F8"
BuildableName = "paddle-mobile-demo.app"
BlueprintName = "paddle-mobile-demo"
ReferencedContainer = "container:paddle-mobile-demo.xcodeproj">
</BuildableReference>
</BuildActionEntry>
</BuildActionEntries>
</BuildAction>
<TestAction
buildConfiguration = "Debug"
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
shouldUseLaunchSchemeArgsEnv = "YES">
<Testables>
</Testables>
<MacroExpansion>
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "FC039B7D20E11C550081E9F8"
BuildableName = "paddle-mobile-demo.app"
BlueprintName = "paddle-mobile-demo"
ReferencedContainer = "container:paddle-mobile-demo.xcodeproj">
</BuildableReference>
</MacroExpansion>
<AdditionalOptions>
</AdditionalOptions>
</TestAction>
<LaunchAction
buildConfiguration = "Debug"
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
launchStyle = "0"
useCustomWorkingDirectory = "NO"
ignoresPersistentStateOnLaunch = "NO"
debugDocumentVersioning = "YES"
debugServiceExtension = "internal"
allowLocationSimulation = "YES">
<BuildableProductRunnable
runnableDebuggingMode = "0">
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "FC039B7D20E11C550081E9F8"
BuildableName = "paddle-mobile-demo.app"
BlueprintName = "paddle-mobile-demo"
ReferencedContainer = "container:paddle-mobile-demo.xcodeproj">
</BuildableReference>
</BuildableProductRunnable>
<AdditionalOptions>
</AdditionalOptions>
</LaunchAction>
<ProfileAction
buildConfiguration = "Release"
shouldUseLaunchSchemeArgsEnv = "YES"
savedToolIdentifier = ""
useCustomWorkingDirectory = "NO"
debugDocumentVersioning = "YES">
<BuildableProductRunnable
runnableDebuggingMode = "0">
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "FC039B7D20E11C550081E9F8"
BuildableName = "paddle-mobile-demo.app"
BlueprintName = "paddle-mobile-demo"
ReferencedContainer = "container:paddle-mobile-demo.xcodeproj">
</BuildableReference>
</BuildableProductRunnable>
</ProfileAction>
<AnalyzeAction
buildConfiguration = "Debug">
</AnalyzeAction>
<ArchiveAction
buildConfiguration = "Release"
revealArchiveInOrganizer = "YES">
</ArchiveAction>
</Scheme>
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>SchemeUserState</key>
<dict>
<key>paddle-mobile-demo.xcscheme</key>
<dict>
<key>orderHint</key>
<integer>2</integer>
</dict>
</dict>
<key>SuppressBuildableAutocreation</key>
<dict>
<key>FC039B7D20E11C550081E9F8</key>
<dict>
<key>primary</key>
<true/>
</dict>
</dict>
</dict>
</plist>
......@@ -19,7 +19,6 @@ class AppDelegate: UIResponder, UIApplicationDelegate {
var window: UIWindow?
func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplicationLaunchOptionsKey: Any]?) -> Bool {
// Override point for customization after application launch.
return true
......
......@@ -11,6 +11,34 @@
<capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
</dependencies>
<scenes>
<!--Multi Predict View Controller-->
<scene sceneID="ec4-AW-9Vs">
<objects>
<viewController id="Vwd-lt-764" customClass="MultiPredictViewController" customModule="paddle_mobile_demo" customModuleProvider="target" sceneMemberID="viewController">
<view key="view" contentMode="scaleToFill" id="55D-rz-Ex6">
<rect key="frame" x="0.0" y="0.0" width="375" height="667"/>
<autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
<subviews>
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="TQt-X9-PdF">
<rect key="frame" x="164" y="318" width="46" height="30"/>
<state key="normal" title="Button"/>
<connections>
<action selector="predictAct:" destination="Vwd-lt-764" eventType="touchUpInside" id="d4z-Cv-6jY"/>
</connections>
</button>
</subviews>
<color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
<constraints>
<constraint firstItem="TQt-X9-PdF" firstAttribute="centerY" secondItem="55D-rz-Ex6" secondAttribute="centerY" id="bL3-wr-TcH"/>
<constraint firstItem="TQt-X9-PdF" firstAttribute="centerX" secondItem="55D-rz-Ex6" secondAttribute="centerX" id="sBi-RQ-sJn"/>
</constraints>
<viewLayoutGuide key="safeArea" id="bsd-h4-RYZ"/>
</view>
</viewController>
<placeholder placeholderIdentifier="IBFirstResponder" id="68E-SG-96s" userLabel="First Responder" sceneMemberID="firstResponder"/>
</objects>
<point key="canvasLocation" x="-559" y="686"/>
</scene>
<!--View Controller-->
<scene sceneID="tne-QT-ifu">
<objects>
......@@ -20,12 +48,11 @@
<autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
<subviews>
<imageView userInteractionEnabled="NO" contentMode="scaleAspectFit" horizontalHuggingPriority="251" verticalHuggingPriority="251" translatesAutoresizingMaskIntoConstraints="NO" id="ZZh-fw-LwK">
<rect key="frame" x="0.0" y="20" width="375" height="247"/>
<rect key="frame" x="0.0" y="20" width="225" height="247"/>
</imageView>
<label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" text="Thread:" textAlignment="natural" lineBreakMode="tailTruncation" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="2EB-m2-a3L">
<rect key="frame" x="10" y="538" width="68" height="24"/>
<label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" horizontalCompressionResistancePriority="749" text="Platform:" textAlignment="natural" lineBreakMode="tailTruncation" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="2EB-m2-a3L">
<rect key="frame" x="10" y="538" width="35" height="24"/>
<constraints>
<constraint firstAttribute="width" constant="68" id="Q5J-tq-JSX"/>
<constraint firstAttribute="height" constant="24" id="SYv-As-Si8"/>
</constraints>
<fontDescription key="fontDescription" type="system" pointSize="20"/>
......@@ -33,12 +60,12 @@
<nil key="highlightedColor"/>
</label>
<pickerView contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="DlO-dk-RMr">
<rect key="frame" x="88" y="510.5" width="287" height="80"/>
<rect key="frame" x="55" y="510.5" width="320" height="80"/>
<constraints>
<constraint firstAttribute="height" constant="80" id="Sbi-05-Mwd"/>
</constraints>
</pickerView>
<pickerView contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="6MG-gv-hD5">
<pickerView contentMode="scaleToFill" horizontalCompressionResistancePriority="749" translatesAutoresizingMaskIntoConstraints="NO" id="6MG-gv-hD5">
<rect key="frame" x="85" y="401" width="290" height="80"/>
<constraints>
<constraint firstAttribute="height" constant="80" id="yAL-JY-G6b"/>
......@@ -47,7 +74,6 @@
<label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" text="Models" textAlignment="natural" lineBreakMode="tailTruncation" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="avL-VK-Kha">
<rect key="frame" x="10" y="429" width="65" height="24"/>
<constraints>
<constraint firstAttribute="width" constant="65" id="6oA-g2-Xq4"/>
<constraint firstAttribute="height" constant="24" id="EwE-B3-z2R"/>
</constraints>
<fontDescription key="fontDescription" type="system" pointSize="20"/>
......@@ -142,9 +168,14 @@
<fontDescription key="fontDescription" type="system" pointSize="15"/>
<textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
</textView>
<view contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="Cil-py-NiA">
<rect key="frame" x="225" y="20" width="150" height="247"/>
<color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
</view>
</subviews>
<color key="backgroundColor" red="1" green="1" blue="1" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
<constraints>
<constraint firstItem="m5L-O7-P31" firstAttribute="top" secondItem="Cil-py-NiA" secondAttribute="bottom" constant="10" id="16p-IK-b5X"/>
<constraint firstItem="6Tk-OE-BBY" firstAttribute="trailing" secondItem="VQn-bS-fWp" secondAttribute="trailing" constant="10" id="1Xg-0h-9SE"/>
<constraint firstItem="avL-VK-Kha" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" constant="10" id="2t9-hS-VXa"/>
<constraint firstItem="R90-Yf-S6g" firstAttribute="centerY" secondItem="wUL-9N-u1V" secondAttribute="centerY" id="76b-Ny-1Og"/>
......@@ -159,11 +190,12 @@
<constraint firstItem="XpL-9M-UOp" firstAttribute="centerY" secondItem="wUL-9N-u1V" secondAttribute="centerY" id="KWW-qT-Rzf"/>
<constraint firstItem="6MG-gv-hD5" firstAttribute="centerY" secondItem="avL-VK-Kha" secondAttribute="centerY" id="KZa-YZ-DEs"/>
<constraint firstItem="2EB-m2-a3L" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" constant="10" id="Le3-TN-zOL"/>
<constraint firstItem="ZZh-fw-LwK" firstAttribute="trailing" secondItem="6Tk-OE-BBY" secondAttribute="trailing" id="MeS-HQ-voE"/>
<constraint firstItem="ZZh-fw-LwK" firstAttribute="trailing" secondItem="6Tk-OE-BBY" secondAttribute="trailing" constant="-150" id="MeS-HQ-voE"/>
<constraint firstItem="m5L-O7-P31" firstAttribute="top" secondItem="ZZh-fw-LwK" secondAttribute="bottom" constant="10" id="NUL-Ta-VI8"/>
<constraint firstItem="m5L-O7-P31" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" constant="15" id="RFA-z1-9aB"/>
<constraint firstItem="wUL-9N-u1V" firstAttribute="width" secondItem="a3K-ri-NVs" secondAttribute="width" id="Rp6-Bh-BN3"/>
<constraint firstItem="6MG-gv-hD5" firstAttribute="trailing" secondItem="6Tk-OE-BBY" secondAttribute="trailing" id="S0W-0G-75m"/>
<constraint firstItem="Cil-py-NiA" firstAttribute="top" secondItem="6Tk-OE-BBY" secondAttribute="top" id="UNc-Et-9Yv"/>
<constraint firstItem="w7H-Sk-Rai" firstAttribute="leading" secondItem="wUL-9N-u1V" secondAttribute="trailing" id="VBM-8b-jP0"/>
<constraint firstItem="VQn-bS-fWp" firstAttribute="top" secondItem="m5L-O7-P31" secondAttribute="bottom" constant="8" id="VpS-4N-mOo"/>
<constraint firstItem="wUL-9N-u1V" firstAttribute="top" secondItem="2EB-m2-a3L" secondAttribute="bottom" constant="35" id="VpU-j2-gaE"/>
......@@ -175,10 +207,12 @@
<constraint firstItem="ZZh-fw-LwK" firstAttribute="top" secondItem="6Tk-OE-BBY" secondAttribute="top" id="eIC-fZ-OEE"/>
<constraint firstItem="976-fk-Kx2" firstAttribute="centerY" secondItem="wUL-9N-u1V" secondAttribute="centerY" id="fFg-pB-eyU"/>
<constraint firstItem="6Tk-OE-BBY" firstAttribute="bottom" secondItem="wUL-9N-u1V" secondAttribute="bottom" constant="40" id="fG6-0p-I0P"/>
<constraint firstItem="Cil-py-NiA" firstAttribute="trailing" secondItem="6Tk-OE-BBY" secondAttribute="trailing" id="gGK-DB-ibv"/>
<constraint firstItem="XpL-9M-UOp" firstAttribute="leading" secondItem="w7H-Sk-Rai" secondAttribute="trailing" id="guC-Db-cA9"/>
<constraint firstItem="6MG-gv-hD5" firstAttribute="leading" secondItem="avL-VK-Kha" secondAttribute="trailing" constant="10" id="jNW-iC-u7V"/>
<constraint firstItem="4ey-Xr-U4e" firstAttribute="bottom" secondItem="6Tk-OE-BBY" secondAttribute="bottom" id="o1X-q5-P7j"/>
<constraint firstItem="6MG-gv-hD5" firstAttribute="top" secondItem="VQn-bS-fWp" secondAttribute="bottom" constant="8" id="tAE-ss-jlA"/>
<constraint firstItem="Cil-py-NiA" firstAttribute="leading" secondItem="ZZh-fw-LwK" secondAttribute="trailing" id="teJ-PP-h2R"/>
<constraint firstItem="4ey-Xr-U4e" firstAttribute="top" secondItem="wUL-9N-u1V" secondAttribute="bottom" constant="10" id="udc-wT-jqd"/>
<constraint firstItem="ZZh-fw-LwK" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" id="vXI-l2-CjL"/>
<constraint firstItem="VQn-bS-fWp" firstAttribute="leading" secondItem="6Tk-OE-BBY" secondAttribute="leading" constant="10" id="wtI-Dl-YPq"/>
......@@ -195,11 +229,81 @@
<outlet property="resultTextView" destination="VQn-bS-fWp" id="306-c7-3vM"/>
<outlet property="selectImageView" destination="ZZh-fw-LwK" id="afR-Bv-6AW"/>
<outlet property="threadPickerView" destination="DlO-dk-RMr" id="Kk4-QV-b5o"/>
<outlet property="videoView" destination="Cil-py-NiA" id="QY2-BP-SNS"/>
</connections>
</viewController>
<placeholder placeholderIdentifier="IBFirstResponder" id="dkx-z0-nzr" sceneMemberID="firstResponder"/>
</objects>
<point key="canvasLocation" x="-724" y="98.50074962518741"/>
<point key="canvasLocation" x="-1449" y="-3"/>
</scene>
<!--Load Pointer View Controller-->
<scene sceneID="56v-9i-I4d">
<objects>
<viewController id="4MS-jc-i6A" customClass="LoadPointerViewController" sceneMemberID="viewController">
<view key="view" contentMode="scaleToFill" id="VbZ-nk-rJR">
<rect key="frame" x="0.0" y="0.0" width="375" height="667"/>
<autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
<subviews>
<imageView userInteractionEnabled="NO" contentMode="scaleToFill" horizontalHuggingPriority="251" verticalHuggingPriority="251" fixedFrame="YES" translatesAutoresizingMaskIntoConstraints="NO" id="2p5-S3-M4T">
<rect key="frame" x="16" y="63" width="240" height="128"/>
<autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
</imageView>
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="37q-nm-0H7">
<rect key="frame" x="38" y="610" width="42" height="30"/>
<constraints>
<constraint firstAttribute="height" constant="30" id="ofW-G3-KST"/>
<constraint firstAttribute="width" constant="42" id="pwd-tO-zcJ"/>
</constraints>
<state key="normal" title="Image"/>
</button>
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="fAg-ai-yaA">
<rect key="frame" x="119" y="610" width="34" height="30"/>
<constraints>
<constraint firstAttribute="height" constant="30" id="IES-jf-Z1n"/>
<constraint firstAttribute="width" constant="34" id="jxK-Xn-WCE"/>
</constraints>
<state key="normal" title="Load"/>
<connections>
<action selector="loaderButtonPressed:" destination="4MS-jc-i6A" eventType="touchUpInside" id="3cy-PD-aiE"/>
</connections>
</button>
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="pdS-6e-Pd1">
<rect key="frame" x="185" y="610" width="49" height="30"/>
<constraints>
<constraint firstAttribute="width" constant="49" id="ddY-uM-fzA"/>
<constraint firstAttribute="height" constant="30" id="yKd-YL-UML"/>
</constraints>
<state key="normal" title="Predict"/>
<connections>
<action selector="predictButtonPressed:" destination="4MS-jc-i6A" eventType="touchUpInside" id="sOH-iT-s1w"/>
</connections>
</button>
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="DZa-sd-lY7">
<rect key="frame" x="279" y="610" width="34" height="30"/>
<constraints>
<constraint firstAttribute="width" constant="34" id="aSO-4q-PgA"/>
<constraint firstAttribute="height" constant="30" id="eAt-Uc-BxX"/>
</constraints>
<state key="normal" title="clear"/>
</button>
</subviews>
<color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
<constraints>
<constraint firstItem="vsb-FH-h7h" firstAttribute="bottom" secondItem="37q-nm-0H7" secondAttribute="bottom" constant="27" id="4Wf-Uh-gvr"/>
<constraint firstItem="DZa-sd-lY7" firstAttribute="leading" secondItem="pdS-6e-Pd1" secondAttribute="trailing" constant="45" id="8dB-uI-cs9"/>
<constraint firstItem="fAg-ai-yaA" firstAttribute="leading" secondItem="37q-nm-0H7" secondAttribute="trailing" constant="39" id="EAV-Oq-jeD"/>
<constraint firstItem="vsb-FH-h7h" firstAttribute="bottom" secondItem="fAg-ai-yaA" secondAttribute="bottom" constant="27" id="Px0-A9-Eql"/>
<constraint firstItem="pdS-6e-Pd1" firstAttribute="leading" secondItem="fAg-ai-yaA" secondAttribute="trailing" constant="32" id="ZUR-Nv-aNb"/>
<constraint firstItem="vsb-FH-h7h" firstAttribute="bottom" secondItem="pdS-6e-Pd1" secondAttribute="bottom" constant="27" id="kPx-mt-ab9"/>
<constraint firstItem="37q-nm-0H7" firstAttribute="leading" secondItem="vsb-FH-h7h" secondAttribute="leading" constant="38" id="trH-Fq-sSv"/>
<constraint firstItem="vsb-FH-h7h" firstAttribute="bottom" secondItem="DZa-sd-lY7" secondAttribute="bottom" constant="27" id="yNJ-hq-2Qg"/>
</constraints>
<viewLayoutGuide key="safeArea" id="vsb-FH-h7h"/>
</view>
</viewController>
<placeholder placeholderIdentifier="IBFirstResponder" id="hGb-Pb-icS" userLabel="First Responder" sceneMemberID="firstResponder"/>
</objects>
<point key="canvasLocation" x="-721" y="-427"/>
</scene>
</scenes>
<resources>
......
//
// LoadPointerViewController.h
// paddle-mobile-demo
//
// Created by Xiao,Haichun on 2018/9/19.
// Copyright © 2018年 orange. All rights reserved.
//
#import <UIKit/UIKit.h>
@interface LoadPointerViewController : UIViewController
@end
//
// LoadPointerViewController.m
// paddle-mobile-demo
//
// Created by Xiao,Haichun on 2018/9/19.
// Copyright © 2018年 orange. All rights reserved.
//
#import "LoadPointerViewController.h"
#import <Metal/Metal.h>
#import "paddle-mobile-demo-Bridging-Header.h"
@interface LoadPointerViewController ()
@property (strong, nonatomic) id<MTLDevice> device;
@property (strong, nonatomic) id<MTLTexture> texture;
@property (strong, nonatomic) id<MTLCommandQueue> queue;
@property (strong, nonatomic) PaddleMobileGPU *runner;
@property (strong, nonatomic) ModelConfig *modelConfig;
@end
@implementation LoadPointerViewController
- (void)viewDidLoad {
[super viewDidLoad];
self.device = MTLCreateSystemDefaultDevice();
self.queue = [self.device newCommandQueue];
// Do any additional setup after loading the view.
// NSString *modelPath = [[NSBundle mainBundle] URLForResource:@"genet_model" withExtension:nil].path;
// NSString *paramPath = [[NSBundle mainBundle] URLForResource:@"genet_params" withExtension:nil].path;
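// Read the model and parameter files into heap buffers; their raw pointers and sizes are handed to ModelConfig below.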
NSString *modelPath = [[NSBundle mainBundle] URLForResource:@"ar_model" withExtension:nil].path;
NSString *paramPath = [[NSBundle mainBundle] URLForResource:@"ar_params" withExtension:nil].path;
long fileSize;
FILE *fp;
fp = fopen([modelPath UTF8String], "rb");
fseek(fp, 0, SEEK_END);
fileSize = ftell(fp);
rewind(fp);
void *buffer = malloc(fileSize);
fread(buffer, 1, fileSize, fp);
fclose(fp);
long paramfileSize;
FILE *parmaFilePointer;
parmaFilePointer = fopen([paramPath UTF8String], "rb");
fseek(parmaFilePointer, 0, SEEK_END);
paramfileSize = ftell(parmaFilePointer);
rewind(parmaFilePointer);
void *parmaBuffer = malloc(paramfileSize);
fread(parmaBuffer, 1, paramfileSize, parmaFilePointer);
fclose(parmaFilePointer);
_modelConfig = [[ModelConfig alloc] init];
// _modelConfig.means = @[[NSNumber numberWithFloat:128.0], [NSNumber numberWithFloat:128.0], [NSNumber numberWithFloat:128.0]];
// _modelConfig.scale = 0.017;
// _modelConfig.dims = @[[NSNumber numberWithFloat:1], [NSNumber numberWithFloat:128.], [NSNumber numberWithFloat:128.0],[NSNumber numberWithFloat:3.0]];
_modelConfig.means = @[[NSNumber numberWithFloat:103.94], [NSNumber numberWithFloat:116.78], [NSNumber numberWithFloat:123.68]];
_modelConfig.scale = 1;
_modelConfig.dims = @[[NSNumber numberWithFloat:1], [NSNumber numberWithFloat:160.], [NSNumber numberWithFloat:160.0],[NSNumber numberWithFloat:3.0]];
_modelConfig.modelPointer = buffer;
_modelConfig.modelSize = (int)fileSize;
_modelConfig.paramPointer = parmaBuffer;
_modelConfig.paramSize = (int)paramfileSize;
}
- (IBAction)loaderButtonPressed:(id)sender {
// _runner = [[PaddleMobileGPU alloc] initWithCommandQueue:self.queue net:GenetType modelConfig:_modelConfig];
_runner = [[PaddleMobileGPU alloc] initWithCommandQueue:self.queue net:MobileNetSSDType modelConfig:_modelConfig];
[_runner load];
}
- (IBAction)predictButtonPressed:(id)sender {
[self predict];
}
- (id<MTLTexture>) createTextureFromImage:(UIImage*) image device:(id<MTLDevice>) device
{
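// Re-orient the UIImage, draw it into an RGBA bitmap context sized to the current view, then copy the pixels into a new RGBA8 MTLTexture.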
image =[UIImage imageWithCGImage:[image CGImage]
scale:[image scale]
orientation: UIImageOrientationLeft];
NSLog(@"orientation and size and stuff %ld %f %f", (long)image.imageOrientation, image.size.width, image.size.height);
CGImageRef imageRef = image.CGImage;
size_t width = self.view.frame.size.width;
size_t height = self.view.frame.size.height;
size_t bitsPerComponent = CGImageGetBitsPerComponent(imageRef);
size_t bitsPerPixel = CGImageGetBitsPerPixel(imageRef);
CGColorSpaceRef colorSpace = CGImageGetColorSpace(imageRef);
CGImageAlphaInfo alphaInfo = CGImageGetAlphaInfo(imageRef);
// NSLog(@"%@ %u", colorSpace, alphaInfo);
CGBitmapInfo bitmapInfo = kCGBitmapByteOrderDefault | alphaInfo;
// NSLog(@"bitmap info %u", bitmapInfo);
CGContextRef context = CGBitmapContextCreate( NULL, width, height, bitsPerComponent, (bitsPerPixel / 8) * width, colorSpace, bitmapInfo);
if( !context )
{
NSLog(@"Failed to load image, probably an unsupported texture type");
return nil;
}
CGContextDrawImage( context, CGRectMake( 0, 0, width, height ), image.CGImage);
MTLPixelFormat format = MTLPixelFormatRGBA8Unorm;
MTLTextureDescriptor *texDesc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:format
width:width
height:height
mipmapped:NO];
id<MTLTexture> texture = [device newTextureWithDescriptor:texDesc];
[texture replaceRegion:MTLRegionMake2D(0, 0, width, height)
mipmapLevel:0
withBytes:CGBitmapContextGetData(context)
bytesPerRow:4 * width];
return texture;
}
- (void)predict {
_texture = [self createTextureFromImage:[UIImage imageNamed:@"hand.jpg"] device:self.device];
NSTimeInterval startTime = [[NSDate date] timeIntervalSince1970];
NSInteger max = 428;
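// Issue `max` predictions back to back; when the final completion fires, log the average per-inference time in milliseconds.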
for (int i = 0;i < max; i ++) {
[_runner predict:_texture withCompletion:^(BOOL success , NSArray<NSNumber *> *result) {
if (success) {
if (i == max -1) {
double time = [[NSDate date] timeIntervalSince1970] - startTime;
time = (time/max)*1000;
NSLog(@"gap ==== %fms",time);
}
// for (int i = 0; i < result.count; i ++) {
// NSNumber *number = result[i];
// NSLog(@"result %d = %f:",i, [number floatValue]);
// }
}
}];
}
}
- (void)didReceiveMemoryWarning {
[super didReceiveMemoryWarning];
// Dispose of any resources that can be recreated.
}
/*
#pragma mark - Navigation
// In a storyboard-based application, you will often want to do a little preparation before navigation
- (void)prepareForSegue:(UIStoryboardSegue *)segue sender:(id)sender {
// Get the new view controller using [segue destinationViewController].
// Pass the selected object to the new view controller.
}
*/
@end
//
// MetalHelper.swift
// paddle-mobile-demo
//
// Created by liuRuiLong on 2018/7/25.
// Copyright © 2018年 orange. All rights reserved.
//
import Metal
import MetalKit
import Foundation
import paddle_mobile
import MetalPerformanceShaders
class MetalHelper {
let device: MTLDevice
let queue: MTLCommandQueue
let textureLoader: MTKTextureLoader
static let shared: MetalHelper = MetalHelper.init()
private init(){
device = MTLCreateSystemDefaultDevice()!
queue = device.makeCommandQueue()!
textureLoader = MTKTextureLoader.init(device: device)
}
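// scaleTexture resizes `input` to `size` with MPSImageLanczosScale and passes the result to `complete` once the GPU command buffer finishes.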
static func scaleTexture(queue: MTLCommandQueue, input: MTLTexture, size:(width: Int, height: Int), complete: @escaping (MTLTexture) -> Void) {
let tmpTextureDes = MTLTextureDescriptor.init()
tmpTextureDes.width = size.width
tmpTextureDes.height = size.height
tmpTextureDes.depth = 1
tmpTextureDes.usage = [.shaderRead, .shaderWrite]
tmpTextureDes.pixelFormat = .rgba32Float
tmpTextureDes.textureType = .type2D
tmpTextureDes.storageMode = .shared
tmpTextureDes.cpuCacheMode = .defaultCache
let dest = MetalHelper.shared.device.makeTexture(descriptor: tmpTextureDes)
let scale = MPSImageLanczosScale.init(device: MetalHelper.shared.device)
let buffer = queue.makeCommandBuffer()
scale.encode(commandBuffer: buffer!, sourceTexture: input, destinationTexture: dest!)
buffer?.addCompletedHandler({ (buffer) in
complete(dest!)
})
buffer?.commit()
}
}
//
// ModelHelper.swift
// paddle-mobile-demo
//
// Created by liuRuiLong on 2018/8/10.
// Copyright © 2018年 orange. All rights reserved.
//
import UIKit
import MetalKit
import Foundation
import paddle_mobile
import MetalPerformanceShaders
class PreProccess: CusomKernel {
init(device: MTLDevice) {
let s = CusomKernel.Shape.init(inWidth: 224, inHeight: 224, inChannel: 3)
super.init(device: device, inFunctionName: "preprocess", outputDim: s, usePaddleMobileLib: false)
}
}
let modelHelperMap: [SupportModel : ModelHelper] = [.mobilenet : MobileNetHelper.init()]
enum SupportModel: String{
case mobilenet = "mobilenet"
static func supportedModels() -> [SupportModel] {
return [.mobilenet]
}
}
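// A ModelHelper describes one model: its input dims, bundled model/param paths, and the Metal preprocessing kernel it needs; the default getTexture below rescales the input image to 224x224 before prediction.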
protocol ModelHelper {
var dim: [Int] { get }
var modelPath: String { get }
var paramPath: String { get }
var modelDir: String { get }
var preprocessKernel: CusomKernel { get }
func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void)
func resultStr(res: [Float]) -> String
}
extension ModelHelper {
func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) {
let texture = try? MetalHelper.shared.textureLoader.newTexture(cgImage: image, options: [:]) ?! " texture loader error"
MetalHelper.scaleTexture(queue: MetalHelper.shared.queue, input: texture!, size: (224, 224)) { (resTexture) in
getTexture(resTexture)
}
}
}
struct MobileNetHelper: ModelHelper{
class PreWords {
var contents: [String] = []
init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) {
if let filePath = inBundle.path(forResource: fileName, ofType: type) {
let string = try! String.init(contentsOfFile: filePath)
contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{
String($0[$0.index($0.startIndex, offsetBy: 10)...])
}
}else{
fatalError("no file call \(fileName)")
}
}
subscript(index: Int) -> String{
return contents[index]
}
}
let labels = PreWords.init(fileName: "synset")
func resultStr(res: [Float]) -> String {
var s: [String] = []
res.top(r: 5).enumerated().forEach{
s.append(String(format: "%d: %@ (%3.2f%%)", $0 + 1, labels[$1.0], $1.1 * 100))
}
return s.joined(separator: "\n")
}
var preprocessKernel: CusomKernel
let dim = [1, 224, 224, 3]
let modelPath: String
let paramPath: String
let modelDir: String
init() {
modelPath = Bundle.main.path(forResource: "model", ofType: nil) ?! "model null"
paramPath = Bundle.main.path(forResource: "params", ofType: nil) ?! "para null"
modelDir = ""
preprocessKernel = PreProccess.init(device: MetalHelper.shared.device)
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import UIKit
import paddle_mobile
class MultiPredictViewController: UIViewController {
var runner1: Runner!
var runner2: Runner!
override func viewDidLoad() {
super.viewDidLoad()
let mobileNet = MobileNet_ssd_hand.init(device: MetalHelper.shared.device)
let genet = Genet.init(device: MetalHelper.shared.device)
runner1 = Runner.init(inNet: mobileNet, commandQueue: MetalHelper.shared.queue, inPlatform: .GPU)
let queue2 = MetalHelper.shared.device.makeCommandQueue()
runner2 = Runner.init(inNet: genet, commandQueue: MetalHelper.shared.queue, inPlatform: .GPU)
}
@IBAction func predictAct(_ sender: Any) {
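// Only the load of runner2 is exercised here; the commented-out blocks below were used to stress-test repeated GPU predictions.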
let success = self.runner2.load()
// DispatchQueue.global().async {
let image1 = UIImage.init(named: "hand.jpg")
// let success = self.runner2.load()
// if success {
// for i in 0..<10000 {
// print(i)
// self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in
// print("result1: ")
//// print(res)
// })
// }
// } else {
// print("load failed")
// }
// self.runner1.clear()
// }
// return
// DispatchQueue.global().async {
//// sleep(1)
// let image1 = UIImage.init(named: "banana.jpeg")
//// if success {
// for _ in 0..<10 {
// self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in
// print("result2: ")
// print(res)
// })
// }
//// } else {
//// print("load failed")
//// }
//// self.runner2.clear()
// }
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Metal
import MetalKit
import Foundation
import paddle_mobile
class MetalHelper {
let device: MTLDevice
let queue: MTLCommandQueue
let textureLoader: MTKTextureLoader
static let shared: MetalHelper = MetalHelper.init()
private init(){
device = MTLCreateSystemDefaultDevice()!
queue = device.makeCommandQueue()!
textureLoader = MTKTextureLoader.init(device: device)
}
}
//
// PaddleMobile.swift
// paddle-mobile-demo
//
// Created by liuRuiLong on 2018/9/5.
// Copyright © 2018年 orange. All rights reserved.
//
import Foundation
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
using namespace metal;
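// Each *_preprocess kernel normalizes one pixel: subtract the model's per-channel means, scale by 0.017, and swap RGB to BGR before writing to the output texture.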
kernel void mobilenet_preprocess(
texture2d<float, access::read> inTexture [[texture(0)]],
texture2d<float, access::write> outTexture [[texture(1)]],
uint2 gid [[thread_position_in_grid]])
{
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height()) {
return;
}
const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f);
const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
}
kernel void mobilenet_preprocess_half(
texture2d<half, access::read> inTexture [[texture(0)]],
texture2d<half, access::write> outTexture [[texture(1)]],
uint2 gid [[thread_position_in_grid]])
{
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height()) {
return;
}
const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f);
const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
}
kernel void mobilenet_ssd_preprocess(
texture2d<float, access::read> inTexture [[texture(0)]],
texture2d<float, access::write> outTexture [[texture(1)]],
uint2 gid [[thread_position_in_grid]])
{
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height()) {
return;
}
const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f);
const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
}
kernel void mobilenet_ssd_preprocess_half(
texture2d<half, access::read> inTexture [[texture(0)]],
texture2d<half, access::write> outTexture [[texture(1)]],
uint2 gid [[thread_position_in_grid]])
{
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height()) {
return;
}
const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f);
const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
}
kernel void genet_preprocess(texture2d<float, access::read> inTexture [[texture(0)]], texture2d<float, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]])
{
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height()) {
return;
}
const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f);
const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
}
kernel void genet_preprocess_half(texture2d<half, access::read> inTexture [[texture(0)]], texture2d<half, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]])
{
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height()) {
return;
}
const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f);
const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
}
kernel void mobilent_ar_preprocess(texture2d<float, access::read> inTexture [[texture(0)]], texture2d<float, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]])
{
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height()) {
return;
}
const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f);
const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
}
kernel void mobilent_ar_preprocess_half(texture2d<half, access::read> inTexture [[texture(0)]], texture2d<half, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]])
{
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height()) {
return;
}
const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f);
const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
}
kernel void scale(texture2d<float, access::sample> inTexture [[texture(0)]], texture2d<float, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height()) return;
float w_stride = inTexture.get_width() / outTexture.get_width();
float h_stride = inTexture.get_height() / outTexture.get_height();
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
float4 input = inTexture.sample(sample, float2(gid.x * w_stride, gid.y * h_stride), 0);
outTexture.write(input, gid);
}
kernel void scale_half(texture2d<float, access::sample> inTexture [[texture(0)]], texture2d<half, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height()) return;
float w_stride = inTexture.get_width() / outTexture.get_width();
float h_stride = inTexture.get_height() / outTexture.get_height();
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
float4 input = inTexture.sample(sample, float2(gid.x * w_stride, gid.y * h_stride), 0);
outTexture.write(half4(input), gid);
}
//
// PreProcessKernel.metal
// paddle-mobile-demo
//
// Created by liuRuiLong on 2018/7/20.
// Copyright © 2018年 orange. All rights reserved.
//
#include <metal_stdlib>
using namespace metal;
kernel void preprocess(
texture2d<float, access::read> inTexture [[texture(0)]],
texture2d<float, access::write> outTexture [[texture(1)]],
uint2 gid [[thread_position_in_grid]])
{
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height()) {
return;
}
const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f);
const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
}
kernel void preprocess_half(
texture2d<half, access::read> inTexture [[texture(0)]],
texture2d<half, access::write> outTexture [[texture(1)]],
uint2 gid [[thread_position_in_grid]])
{
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height()) {
return;
}
const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f);
const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
}
import Foundation
import QuartzCore
public class FPSCounter {
private(set) public var fps: Double = 0
var frames = 0
var startTime: CFTimeInterval = 0
public func start() {
frames = 0
startTime = CACurrentMediaTime()
}
public func frameCompleted() {
frames += 1
let now = CACurrentMediaTime()
let elapsed = now - startTime
if elapsed > 0.1 {
let current = Double(frames) / elapsed
let smoothing = 0.75
fps = smoothing*fps + (1 - smoothing)*current
if elapsed > 1 {
frames = 0
startTime = CACurrentMediaTime()
}
}
}
}
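// A minimal usage sketch (an illustration, not part of the original file): create one counter,
// call start() once, then frameCompleted() after every rendered frame and read `fps`.
// The name `fpsLabel` below is hypothetical.
//
//   let counter = FPSCounter()
//   counter.start()
//   func draw(in view: MTKView) {          // e.g. an MTKViewDelegate callback
//       // ... encode and present the frame ...
//       counter.frameCompleted()
//       fpsLabel.text = String(format: "%.1f FPS", counter.fps)
//   }
//
// `fps` is an exponentially smoothed average (75% previous value, 25% latest measurement); the
// measurement window is reset roughly once per second, so the value starts at 0 and ramps up
// over the first few updates.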
import UIKit
import Metal
import CoreVideo
import AVFoundation
@available(iOS 10.0, *)
@objc public protocol VideoCaptureDelegate: NSObjectProtocol {
@objc optional func videoCapture(_ capture: VideoCapture, didCaptureSampleBuffer sampleBuffer: CMSampleBuffer, timestamp: CMTime)
@objc optional func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime)
@objc optional func videoCapture(_ capture: VideoCapture, didCapturePhoto previewImage: UIImage?)
@objc optional func videoCapture(_ capture: VideoCapture, didCapturePhotoTexture texture: MTLTexture?)
}
/**
Simple interface to the iPhone's camera.
*/
@available(iOS 10.0, *)
public class VideoCapture: NSObject {
public var previewLayer: AVCaptureVideoPreviewLayer?
public weak var delegate: VideoCaptureDelegate?
public var fps = -1
private let device: MTLDevice?
private let videoOrientation: AVCaptureVideoOrientation
private var textureCache: CVMetalTextureCache?
private let captureSession = AVCaptureSession()
private let videoOutput = AVCaptureVideoDataOutput()
private let photoOutput = AVCapturePhotoOutput()
private let queue = DispatchQueue(label: "net.machinethink.camera-queue")
private var lastTimestamp = CMTime()
private let cameraPosition: AVCaptureDevice.Position
public init(device: MTLDevice? = nil, orientation: AVCaptureVideoOrientation = .portrait, position: AVCaptureDevice.Position = .back) {
self.device = device
self.videoOrientation = orientation
self.cameraPosition = position
super.init()
}
public func setUp(sessionPreset: AVCaptureSession.Preset = .medium,
completion: @escaping (Bool) -> Void) {
queue.async {
let success = self.setUpCamera(sessionPreset: sessionPreset)
DispatchQueue.main.async {
completion(success)
}
}
}
func frontCamera() -> AVCaptureDevice? {
let devices = AVCaptureDevice.DiscoverySession.init(deviceTypes: [.builtInWideAngleCamera], mediaType: AVMediaType.video, position: .front).devices
return devices.first
}
func setUpCamera(sessionPreset: AVCaptureSession.Preset) -> Bool {
if let inDevice = device{
guard CVMetalTextureCacheCreate(kCFAllocatorDefault, nil, inDevice, nil, &textureCache) == kCVReturnSuccess else {
print("Error: could not create a texture cache")
return false
}
}
captureSession.beginConfiguration()
captureSession.sessionPreset = sessionPreset
var oCaptureDevice: AVCaptureDevice?
switch cameraPosition {
case .back:
oCaptureDevice = AVCaptureDevice.default(for: AVMediaType.video)
break
case .front:
oCaptureDevice = frontCamera()
break
default:
break
}
guard let captureDevice = oCaptureDevice else {
print("Error: no video devices available")
return false
}
guard let videoInput = try? AVCaptureDeviceInput(device: captureDevice) else {
print("Error: could not create AVCaptureDeviceInput")
return false
}
if captureSession.canAddInput(videoInput) {
captureSession.addInput(videoInput)
}
let previewLayer = AVCaptureVideoPreviewLayer(session: captureSession)
previewLayer.videoGravity = AVLayerVideoGravity.resizeAspect
previewLayer.connection?.videoOrientation = self.videoOrientation
self.previewLayer = previewLayer
let settings: [String : Any] = [
kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA)
]
videoOutput.videoSettings = settings
videoOutput.alwaysDiscardsLateVideoFrames = true
videoOutput.setSampleBufferDelegate(self, queue: queue)
if captureSession.canAddOutput(videoOutput) {
captureSession.addOutput(videoOutput)
}
// We want the buffers to be in portrait orientation otherwise they are
// rotated by 90 degrees. Need to set this _after_ addOutput()!
videoOutput.connection(with: AVMediaType.video)?.videoOrientation = self.videoOrientation
if captureSession.canAddOutput(photoOutput) {
captureSession.addOutput(photoOutput)
}
captureSession.commitConfiguration()
return true
}
public func start() {
if !captureSession.isRunning {
captureSession.startRunning()
}
}
public func stop() {
if captureSession.isRunning {
captureSession.stopRunning()
}
}
/* Captures a single frame of the camera input. */
public func capturePhoto() {
let settings = AVCapturePhotoSettings(format: [kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA)])
settings.previewPhotoFormat = [
kCVPixelBufferPixelFormatTypeKey as String: settings.__availablePreviewPhotoPixelFormatTypes[0],
kCVPixelBufferWidthKey as String: 480,
kCVPixelBufferHeightKey as String: 360,
]
photoOutput.capturePhoto(with: settings, delegate: self)
}
func convertToMTLTexture(sampleBuffer: CMSampleBuffer?) -> MTLTexture? {
if let textureCache = textureCache, let sampleBuffer = sampleBuffer, let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) {
let width = CVPixelBufferGetWidth(imageBuffer)
let height = CVPixelBufferGetHeight(imageBuffer)
var texture: CVMetalTexture?
CVMetalTextureCacheCreateTextureFromImage(kCFAllocatorDefault, textureCache, imageBuffer, nil, .bgra8Unorm, width, height, 0, &texture)
if let texture = texture {
return CVMetalTextureGetTexture(texture)
}
}
return nil
}
func convertToUIImage(sampleBuffer: CMSampleBuffer?) -> UIImage? {
if let sampleBuffer = sampleBuffer,
let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) {
let width = CVPixelBufferGetWidth(imageBuffer)
let height = CVPixelBufferGetHeight(imageBuffer)
let rect = CGRect(x: 0, y: 0, width: CGFloat(width), height: CGFloat(height))
let ciImage = CIImage(cvPixelBuffer: imageBuffer)
let ciContext = CIContext(options: nil)
if let cgImage = ciContext.createCGImage(ciImage, from: rect) {
return UIImage(cgImage: cgImage)
}
}
return nil
}
}
@available(iOS 10.0, *)
extension VideoCapture: AVCaptureVideoDataOutputSampleBufferDelegate {
public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
// Because lowering the capture device's FPS looks ugly in the preview,
// we capture at full speed but only call the delegate at its desired
// framerate. If `fps` is -1, we run at the full framerate.
let timestamp = CMSampleBufferGetPresentationTimeStamp(sampleBuffer)
let deltaTime = timestamp - lastTimestamp
if fps == -1 || deltaTime >= CMTimeMake(1, Int32(fps)) {
lastTimestamp = timestamp
self.delegate?.videoCapture?(self, didCaptureSampleBuffer: sampleBuffer, timestamp: timestamp)
if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCaptureVideoTexture:timestamp:))) ?? false{
let texture = convertToMTLTexture(sampleBuffer: sampleBuffer)
delegate?.videoCapture?(self, didCaptureVideoTexture: texture, timestamp: timestamp)
}
}
}
public func captureOutput(_ output: AVCaptureOutput, didDrop sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
print("dropped frame")
}
}
@available(iOS 10.0, *)
extension VideoCapture: AVCapturePhotoCaptureDelegate {
public func photoOutput(_ captureOutput: AVCapturePhotoOutput,
didFinishProcessingPhoto photoSampleBuffer: CMSampleBuffer?,
previewPhoto previewPhotoSampleBuffer: CMSampleBuffer?,
resolvedSettings: AVCaptureResolvedPhotoSettings,
bracketSettings: AVCaptureBracketedStillImageSettings?,
error: Error?) {
var imageTexture: MTLTexture?
var previewImage: UIImage?
if error == nil {
if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhotoTexture:))) ?? false{
imageTexture = convertToMTLTexture(sampleBuffer: photoSampleBuffer)
self.delegate?.videoCapture?(self, didCapturePhotoTexture: imageTexture)
}
if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhoto:))) ?? false{
previewImage = convertToUIImage(sampleBuffer: previewPhotoSampleBuffer)
self.delegate?.videoCapture?(self, didCapturePhoto: previewImage)
}
}
}
}
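// Usage notes (added commentary, not part of the original file): a typical setup is to init the
// capture with the MTLDevice, assign `delegate`, call setUp(sessionPreset:completion:), add
// `previewLayer` to a view, then call start(). All delegate methods are optional. The video-frame
// callbacks (didCaptureSampleBuffer / didCaptureVideoTexture) are delivered on the private
// "net.machinethink.camera-queue" DispatchQueue, so any UI work done inside them must be
// dispatched back to the main queue; the photo callbacks arrive on whatever queue
// AVCapturePhotoOutput uses internally, so the same caution presumably applies there as well.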
......@@ -14,164 +14,292 @@
import UIKit
import MetalKit
import CoreMedia
import paddle_mobile
import MetalPerformanceShaders
let threadSupport = [1]
var platform: Platform = .GPU
let threadSupport: [(Platform, String)] = [(.GPU, "GPU"), (.CPU, "CPU")]
//.mobilenet_ssd : Runner.init(inNet: MobileNet_ssd_hand.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform),
let modelHelperMap: [SupportModel : Runner] = [
.genet : Runner.init(inNet: Genet.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform),
.mobilenet_ssd_ar : Runner.init(inNet: MobileNet_ssd_AR.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform)]
//, .genet : Genet.init()
//let modelHelperMap: [SupportModel : Net] = [.mobilenet : MobileNet.init(), .mobilenet_ssd : MobileNet_ssd_hand.init()]
let netSupport: [SupportModel : Net] = [.genet : Genet.init(device: MetalHelper.shared.device), .mobilenet_ssd_ar : MobileNet_ssd_AR.init(device: MetalHelper.shared.device)]
enum SupportModel: String{
// case mobilenet = "mobilenet"
// case mobilenet_ssd = "mobilenetssd"
case genet = "genet"
case mobilenet_ssd_ar = "mobilenetssd_ar"
static func supportedModels() -> [SupportModel] {
// .mobilenet,
// .mobilenet_ssd,
return [.genet, .mobilenet_ssd_ar]
}
}
class ViewController: UIViewController {
@IBOutlet weak var resultTextView: UITextView!
@IBOutlet weak var selectImageView: UIImageView!
@IBOutlet weak var elapsedTimeLabel: UILabel!
@IBOutlet weak var modelPickerView: UIPickerView!
@IBOutlet weak var threadPickerView: UIPickerView!
var selectImage: UIImage?
var program: Program?
var executor: Executor<Float32>?
var modelType: SupportModel = .mobilenet
var toPredictTexture: MTLTexture?
var modelHelper: ModelHelper {
return modelHelperMap[modelType] ?! " has no this type "
}
var threadNum = 1
@IBOutlet weak var resultTextView: UITextView!
@IBOutlet weak var selectImageView: UIImageView!
@IBOutlet weak var elapsedTimeLabel: UILabel!
@IBOutlet weak var modelPickerView: UIPickerView!
@IBOutlet weak var threadPickerView: UIPickerView!
@IBOutlet weak var videoView: UIView!
// var videoCapture: VideoCapture!
var selectImage: UIImage?
var inputPointer: UnsafeMutablePointer<Float32>?
var modelType: SupportModel = SupportModel.supportedModels()[0]
var toPredictTexture: MTLTexture?
var runner: Runner!
var threadNum = 1
@IBAction func loadAct(_ sender: Any) {
runner = Runner.init(inNet: netSupport[modelType]!, commandQueue: MetalHelper.shared.queue, inPlatform: platform)
@IBAction func loadAct(_ sender: Any) {
let inModelHelper = modelHelper
let queue = MetalHelper.shared.queue
let loader = Loader<Float32>.init()
do {
let modelPath = inModelHelper.modelPath
let paraPath = inModelHelper.paramPath
program = try loader.load(device: MetalHelper.shared.device, modelPath: modelPath, paraPath: paraPath)
executor = try Executor<Float32>.init(inDevice: MetalHelper.shared.device, inQueue: queue, inProgram: program!)
} catch let error {
print(error)
if platform == .CPU {
if inputPointer == nil {
inputPointer = runner.preproccess(image: selectImage!.cgImage!)
}
} else if platform == .GPU {
if self.toPredictTexture == nil {
runner.getTexture(image: selectImage!.cgImage!) {[weak self] (texture) in
self?.toPredictTexture = texture
}
}
} else {
fatalError( " unsupport " )
}
@IBAction func selectImageAct(_ sender: Any) {
let imagePicker = UIImagePickerController()
imagePicker.sourceType = .camera
imagePicker.delegate = self
self.present(imagePicker, animated: true, completion: nil)
}
@IBAction func clearAct(_ sender: Any) {
executor?.clear()
program = nil
executor = nil
if runner.load() {
print(" load success ! ")
} else {
print(" load error ! ")
}
@IBAction func predictAct(_ sender: Any) {
guard let inTexture = toPredictTexture else {
resultTextView.text = "请选择图片 ! "
return
}
@IBAction func selectImageAct(_ sender: Any) {
let imagePicker = UIImagePickerController()
imagePicker.sourceType = .camera
imagePicker.delegate = self
self.present(imagePicker, animated: true, completion: nil)
}
@IBAction func clearAct(_ sender: Any) {
runner.clear()
}
@IBAction func predictAct(_ sender: Any) {
let max = 50
switch platform {
case .GPU:
guard let inTexture = toPredictTexture else {
resultTextView.text = "请选择图片 ! "
return
}
for _ in 0..<10{
runner.predict(texture: inTexture) { (success, resultHolder) in
resultHolder?.releasePointer()
}
}
let startDate = Date.init()
for i in 0..<max {
runner.predict(texture: inTexture) { [weak self] (success, resultHolder) in
guard let sSelf = self else {
fatalError()
}
if success {
if i == max - 1 {
let time = Date.init().timeIntervalSince(startDate)
DispatchQueue.main.async {
// print(resultHolder!.result![0])
sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: resultHolder!)
sSelf.elapsedTimeLabel.text = "平均耗时: \(time/Double(max) * 1000.0) ms"
}
}
}
DispatchQueue.main.async {
resultHolder?.releasePointer()
}
// print("释放")
}
guard let inExecutor = executor else {
resultTextView.text = "请先 load ! "
return
// print("sleep before ")
// usleep(33000)
// print("sleep after ")
}
case .CPU:
guard let inInputPointer = inputPointer else {
fatalError( " need input pointer " )
}
for _ in 0..<10 {
runner.predict(inputPointer: inInputPointer) { (success, res) in
res?.releaseOutput()
}
do {
let max = 100
var startDate = Date.init()
for i in 0..<max {
try inExecutor.predict(input: inTexture, expect: modelHelper.dim, completionHandle: { [weak self] (result) in
guard let sSelf = self else {
fatalError()
}
if i == (max / 2 - 1) {
startDate = Date.init()
}
if i == max - 1 {
let time = Date.init().timeIntervalSince(startDate)
DispatchQueue.main.async {
sSelf.resultTextView.text = sSelf.modelHelper.resultStr(res: result.resultArr)
sSelf.elapsedTimeLabel.text = "平均耗时: \(time/Double(max/2) * 1000.0) ms"
}
}
}, preProcessKernle: self.modelHelper.preprocessKernel)
}
let startDate = Date.init()
for i in 0..<max {
runner.predict(inputPointer: inInputPointer) { [weak self](success, res) in
guard let sSelf = self else {
fatalError()
}
if success {
if i == max - 1 {
let time = Date.init().timeIntervalSince(startDate)
DispatchQueue.main.async {
// sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: res)
sSelf.elapsedTimeLabel.text = "平均耗时: \(time/Double(max) * 1000.0) ms"
}
}
} catch let error {
print(error)
}
res?.releaseOutput()
}
}
}
}
override func viewDidLoad() {
super.viewDidLoad()
// if runner.load() {
// print(" load success ! ")
// } else {
// print(" load error ! ")
// }
//
modelPickerView.delegate = self
modelPickerView.dataSource = self
threadPickerView.delegate = self
threadPickerView.dataSource = self
selectImage = UIImage.init(named: "hand.jpg")
selectImageView.image = selectImage
// if platform == .CPU {
// inputPointer = runner.preproccess(image: selectImage!.cgImage!)
// } else if platform == .GPU {
// runner.getTexture(image: selectImage!.cgImage!) {[weak self] (texture) in
// self?.toPredictTexture = texture
// }
// } else {
// fatalError( " unsupport " )
// }
// videoCapture = VideoCapture.init(device: MetalHelper.shared.device, orientation: .portrait, position: .back)
// videoCapture.fps = 30
// videoCapture.delegate = self
// videoCapture.setUp { (success) in
// DispatchQueue.main.async {
// if let preViewLayer = self.videoCapture.previewLayer {
// self.videoView.layer.addSublayer(preViewLayer)
// self.videoCapture.previewLayer?.frame = self.videoView.bounds
// }
// self.videoCapture.start()
// }
// }
override func viewDidLoad() {
super.viewDidLoad()
modelPickerView.delegate = self
modelPickerView.dataSource = self
threadPickerView.delegate = self
threadPickerView.dataSource = self
selectImage = UIImage.init(named: "banana.jpeg")
selectImageView.image = selectImage
modelHelper.getTexture(image: selectImage!.cgImage!) {[weak self] (texture) in
self?.toPredictTexture = texture
}
}
}
}
extension ViewController: UIPickerViewDataSource, UIPickerViewDelegate{
func numberOfComponents(in pickerView: UIPickerView) -> Int {
if pickerView == modelPickerView {
return 1
} else if pickerView == threadPickerView {
return 1
} else {
fatalError()
}
func numberOfComponents(in pickerView: UIPickerView) -> Int {
if pickerView == modelPickerView {
return 1
} else if pickerView == threadPickerView {
return 1
} else {
fatalError()
}
func pickerView(_ pickerView: UIPickerView, numberOfRowsInComponent component: Int) -> Int {
if pickerView == modelPickerView {
return SupportModel.supportedModels().count
} else if pickerView == threadPickerView {
return threadSupport.count
} else {
fatalError()
}
}
func pickerView(_ pickerView: UIPickerView, numberOfRowsInComponent component: Int) -> Int {
if pickerView == modelPickerView {
return SupportModel.supportedModels().count
} else if pickerView == threadPickerView {
return threadSupport.count
} else {
fatalError()
}
public func pickerView(_ pickerView: UIPickerView, titleForRow row: Int, forComponent component: Int) -> String? {
if pickerView == modelPickerView {
return SupportModel.supportedModels()[row].rawValue
} else if pickerView == threadPickerView {
return "\(threadSupport[row])"
} else {
fatalError()
}
}
public func pickerView(_ pickerView: UIPickerView, titleForRow row: Int, forComponent component: Int) -> String? {
if pickerView == modelPickerView {
return SupportModel.supportedModels()[row].rawValue
} else if pickerView == threadPickerView {
return threadSupport[row].1
} else {
fatalError()
}
public func pickerView(_ pickerView: UIPickerView, didSelectRow row: Int, inComponent component: Int) {
if pickerView == modelPickerView {
self.modelType = SupportModel.supportedModels()[row]
} else if pickerView == threadPickerView {
self.threadNum = threadSupport[row]
} else {
fatalError()
}
}
public func pickerView(_ pickerView: UIPickerView, didSelectRow row: Int, inComponent component: Int) {
if pickerView == modelPickerView {
self.modelType = SupportModel.supportedModels()[row]
} else if pickerView == threadPickerView {
platform = threadSupport[row].0
} else {
fatalError()
}
}
}
extension ViewController: UIImagePickerControllerDelegate, UINavigationControllerDelegate {
func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) {
picker.dismiss(animated: true){[weak self] in
guard let sSelf = self, let image = info["UIImagePickerControllerOriginalImage"] as? UIImage else{
fatalError("no image")
}
sSelf.selectImage = image
sSelf.selectImageView.image = image
sSelf.modelHelper.getTexture(image: image.cgImage!, getTexture: { (texture) in
sSelf.toPredictTexture = texture
})
}
func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) {
picker.dismiss(animated: true){[weak self] in
guard let sSelf = self, let image = info["UIImagePickerControllerOriginalImage"] as? UIImage else{
fatalError("no image")
}
sSelf.selectImage = image
sSelf.selectImageView.image = image
sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in
sSelf.toPredictTexture = texture
})
}
}
}
var bool1 = false
extension ViewController: VideoCaptureDelegate{
func predictTexture(texture: MTLTexture){
runner.scaleTexture(input: texture) { (scaledTexture) in
self.runner.predict(texture: scaledTexture, completion: { (success, resultHolder) in
// print(resultHolder!.result![0])
resultHolder?.releasePointer()
})
}
}
// @available(iOS 10.0, *)
// func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime) {
//// if !bool1 {
//// DispatchQueue.main.asyncAfter(deadline: DispatchTime.init(uptimeNanoseconds: 500000000)) {
// self.predictTexture(texture: texture!)
//// }
//
//
//// bool1 = true
//// }
//
// }
}
//
// Use this file to import your target's public headers that you would like to expose to Swift.
//
#import <paddle_mobile/paddle_mobile.h>
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>SchemeUserState</key>
<dict>
<key>paddle-mobile-unit-test.xcscheme</key>
<dict>
<key>orderHint</key>
<integer>6</integer>
</dict>
</dict>
</dict>
</plist>
//
// AppDelegate.swift
// paddle-mobile-unit-test
//
// Created by liuRuiLong on 2018/8/10.
// Copyright © 2018年 orange. All rights reserved.
//
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import UIKit
......
//
// ViewController.swift
// paddle-mobile-unit-test
//
// Created by liuRuiLong on 2018/8/10.
// Copyright © 2018年 orange. All rights reserved.
//
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import UIKit
import Metal
//import MetalKit
import paddle_mobile
class ViewController: UIViewController {
override func viewDidLoad() {
super.viewDidLoad()
let device = Metal.MTLCreateSystemDefaultDevice()!
let queue = device.makeCommandQueue()!
let test = PaddleMobileUnitTest.init(
inDevice: device,
inQueue: queue
)
test.testConcat()
// test.testReshape()
// test.testTranspose()
print(" done ")
}
......
......@@ -7,7 +7,31 @@
objects = {
/* Begin PBXBuildFile section */
4AA1EA862146625E00D0F791 /* BilinearInterpOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA852146625E00D0F791 /* BilinearInterpOp.swift */; };
4AA1EA88214662BD00D0F791 /* BilinearInterpKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA87214662BD00D0F791 /* BilinearInterpKernel.swift */; };
4AA1EA8A2146631C00D0F791 /* BilinearInterp.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA892146631C00D0F791 /* BilinearInterp.metal */; };
4AA1EA8C2146640900D0F791 /* SplitOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA8B2146640900D0F791 /* SplitOp.swift */; };
4AA1EA8E2146647F00D0F791 /* SplitKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA8D2146647F00D0F791 /* SplitKernel.swift */; };
4AA1EA90214664CD00D0F791 /* Split.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA8F214664CD00D0F791 /* Split.metal */; };
4AA1EA92214665D700D0F791 /* ShapeOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA91214665D700D0F791 /* ShapeOp.swift */; };
4AA1EA942146661500D0F791 /* ShapeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA932146661500D0F791 /* ShapeKernel.swift */; };
4AA1EA982146666500D0F791 /* FlattenOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA972146666500D0F791 /* FlattenOp.swift */; };
4AA1EA9E2148D6F900D0F791 /* ConcatKernel.inc.metal in Headers */ = {isa = PBXBuildFile; fileRef = 4AA1EA9D2148D6F900D0F791 /* ConcatKernel.inc.metal */; };
4AA1EAA02148DEEE00D0F791 /* ReshapeKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA9F2148DEEE00D0F791 /* ReshapeKernel.inc.metal */; };
4AA1EAA2214912CD00D0F791 /* FlattenKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA1214912CC00D0F791 /* FlattenKernel.swift */; };
4AA1EAA4214A295C00D0F791 /* Split.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA3214A295C00D0F791 /* Split.inc.metal */; };
4AA1EAA6214B5F6800D0F791 /* Shape.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA5214B5F6800D0F791 /* Shape.metal */; };
4AA1EAA8214B7AFB00D0F791 /* BilinearInterp.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA7214B7AFB00D0F791 /* BilinearInterp.inc.metal */; };
4AA1EAAA214F53D800D0F791 /* BoxCoder.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA9214F53D800D0F791 /* BoxCoder.inc.metal */; };
4AA1EAAC214F55C800D0F791 /* Softmax.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAAB214F55C800D0F791 /* Softmax.inc.metal */; };
4AA1EAAE214F5FD900D0F791 /* TransposeKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAAD214F5FD900D0F791 /* TransposeKernel.inc.metal */; };
4AF928772133F1DB005B6C3A /* BoxCoder.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF928762133F1DB005B6C3A /* BoxCoder.metal */; };
4AF9287921341661005B6C3A /* Softmax.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF9287821341661005B6C3A /* Softmax.metal */; };
4AF928822135673D005B6C3A /* ConcatKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF928812135673D005B6C3A /* ConcatKernel.metal */; };
4AF9288421357BE3005B6C3A /* Elementwise.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF9288321357BE3005B6C3A /* Elementwise.metal */; };
D3831F70E7E0B565B9AC22DA /* Pods_paddle_mobile.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = DD2E06330A1E7129C918DB46 /* Pods_paddle_mobile.framework */; };
FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC0226552138F33800F395E2 /* TransposeKernel.metal */; };
FC0226582138F38D00F395E2 /* PoolKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC0226572138F38D00F395E2 /* PoolKernel.metal */; };
FC039B6F20E11C3C0081E9F8 /* paddle_mobile.h in Headers */ = {isa = PBXBuildFile; fileRef = FC039B6D20E11C3C0081E9F8 /* paddle_mobile.h */; settings = {ATTRIBUTES = (Public, ); }; };
FC039B9720E11C9A0081E9F8 /* Extensions.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B9420E11C9A0081E9F8 /* Extensions.swift */; };
FC039B9820E11C9A0081E9F8 /* Errors.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B9520E11C9A0081E9F8 /* Errors.swift */; };
......@@ -35,17 +59,54 @@
FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */; };
FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */; };
FC1B16B320EC9A4F00678B91 /* Kernels.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC1B16B220EC9A4F00678B91 /* Kernels.metal */; };
FC1B186620ECF1C600678B91 /* ResizeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC1B186520ECF1C600678B91 /* ResizeKernel.swift */; };
FC292C5421421B2F00CF622F /* PaddleMobileGPU.h in Headers */ = {isa = PBXBuildFile; fileRef = FC292C5321421B2E00CF622F /* PaddleMobileGPU.h */; settings = {ATTRIBUTES = (Public, ); }; };
FC292C5621421B4600CF622F /* PaddleMobileGPU.m in Sources */ = {isa = PBXBuildFile; fileRef = FC292C5521421B4600CF622F /* PaddleMobileGPU.m */; };
FC292C81214255BD00CF622F /* CPUCompute.mm in Sources */ = {isa = PBXBuildFile; fileRef = FC292C7C214255BC00CF622F /* CPUCompute.mm */; };
FC292C82214255BD00CF622F /* MobileNetSSD.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC292C7E214255BC00CF622F /* MobileNetSSD.swift */; };
FC292C85214257CB00CF622F /* CPUCompute.h in Headers */ = {isa = PBXBuildFile; fileRef = FC292C7D214255BC00CF622F /* CPUCompute.h */; settings = {ATTRIBUTES = (Public, ); }; };
FC292C872142624800CF622F /* Genet.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC292C862142624800CF622F /* Genet.swift */; };
FC33B0F02147659000714A93 /* MobileNet.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC33B0EF2147659000714A93 /* MobileNet.swift */; };
FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */; };
FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74820F0B954007C0C6D /* ConvKernel.metal */; };
FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */; };
FC4FD9752140E1DE0073E130 /* PaddleMobile.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC4FD9742140E1DE0073E130 /* PaddleMobile.swift */; };
FC4FD9792140E4980073E130 /* PaddleMobileCPU.h in Headers */ = {isa = PBXBuildFile; fileRef = FC4FD9772140E4980073E130 /* PaddleMobileCPU.h */; settings = {ATTRIBUTES = (Public, ); }; };
FC4FD97A2140E4980073E130 /* libpaddle-mobile.a in Frameworks */ = {isa = PBXBuildFile; fileRef = FC4FD9782140E4980073E130 /* libpaddle-mobile.a */; };
FC4FD97E2140F2C30073E130 /* libstdc++.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = FC4FD97D2140F2C30073E130 /* libstdc++.tbd */; };
FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC5163F520EF556E00636C28 /* Texture2DTo2DArrayKernel.swift */; };
FC60DB8920E9AAA500FF203F /* MetalExtension.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC60DB8820E9AAA500FF203F /* MetalExtension.swift */; };
FC803BBF214CB65A0094B8E5 /* ConvAddPreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */; };
FC803BC1214CB77A0094B8E5 /* ConvAddPreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC0214CB77A0094B8E5 /* ConvAddPreluKernel.swift */; };
FC803BC3214CB79C0094B8E5 /* ConvAddPreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC2214CB79C0094B8E5 /* ConvAddPreluKernel.metal */; };
FC803BC5214CB8F00094B8E5 /* ConvAddPrelu.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC4214CB8F00094B8E5 /* ConvAddPrelu.inc.metal */; };
FC803BC7214CBA820094B8E5 /* Macro.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC6214CBA820094B8E5 /* Macro.metal */; };
FC803BC9214CFC8D0094B8E5 /* FetchKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */; };
FC82735920E3C04200BE430A /* OpCreator.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC82735820E3C04200BE430A /* OpCreator.swift */; };
FC9A19E32148C31300CD9CBF /* MobilenetSSD_AR.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9A19E22148C31300CD9CBF /* MobilenetSSD_AR.swift */; };
FC9D037920E229E4000F735A /* OpParam.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D037820E229E4000F735A /* OpParam.swift */; };
FC9D038020E22FBB000F735A /* FeedOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D037F20E22FBB000F735A /* FeedOp.swift */; };
FC9D038220E2312E000F735A /* FetchOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D038120E2312E000F735A /* FetchOp.swift */; };
FC9D038420E23B01000F735A /* Texture.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D038320E23B01000F735A /* Texture.swift */; };
FCA3A1632132A4AC00084FE5 /* ReshapeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */; };
FCA3A1652132A5EB00084FE5 /* Common.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA3A1642132A5EB00084FE5 /* Common.metal */; };
FCA67B1721364EF000BD58AA /* ConvTransposeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */; };
FCA67CD52138272900BD58AA /* ConvAddMetal.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67CD42138272900BD58AA /* ConvAddMetal.metal */; };
FCA67CD7213827AC00BD58AA /* ConvAddBNReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67CD6213827AC00BD58AA /* ConvAddBNReluKernel.metal */; };
FCA67CD92138287B00BD58AA /* ConvBNReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67CD82138287B00BD58AA /* ConvBNReluKernel.metal */; };
FCBCCC572122F41300D94F7E /* DwConvBNReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */; };
FCBCCC592122F42700D94F7E /* ConvBNReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */; };
FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */; };
FCBCCC5D2122F8A100D94F7E /* DepthwiseConvOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC5C2122F8A100D94F7E /* DepthwiseConvOp.swift */; };
FCBCCC5F2122FB3B00D94F7E /* PriorBoxOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC5E2122FB3B00D94F7E /* PriorBoxOp.swift */; };
FCBCCC612122FBDF00D94F7E /* PriorBoxKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC602122FBDF00D94F7E /* PriorBoxKernel.swift */; };
FCBCCC632122FCC000D94F7E /* TransposeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC622122FCC000D94F7E /* TransposeKernel.swift */; };
FCBCCC652122FCD700D94F7E /* TransposeOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC642122FCD700D94F7E /* TransposeOp.swift */; };
FCBCCC67212306B000D94F7E /* ConcatOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC66212306B000D94F7E /* ConcatOp.swift */; };
FCBCCC69212306D300D94F7E /* ConcatKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC68212306D300D94F7E /* ConcatKernel.swift */; };
FCBCCC6B2123071700D94F7E /* BoxcoderOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC6A2123071700D94F7E /* BoxcoderOp.swift */; };
FCBCCC6D2123073A00D94F7E /* BoxcoderKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC6C2123073A00D94F7E /* BoxcoderKernel.swift */; };
FCBCCC6F2123097100D94F7E /* MulticlassNMSOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC6E2123097100D94F7E /* MulticlassNMSOp.swift */; };
FCBCCC71212309A700D94F7E /* MulticlassNMSKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */; };
FCD04E6620F314C50007374F /* PoolOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6520F314C50007374F /* PoolOp.swift */; };
FCD04E6820F315020007374F /* PoolKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6720F315020007374F /* PoolKernel.swift */; };
FCD04E6A20F319EC0007374F /* SoftmaxOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6920F319EC0007374F /* SoftmaxOp.swift */; };
......@@ -55,15 +116,55 @@
FCD04E7220F343420007374F /* ConvAddOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E7120F343420007374F /* ConvAddOp.swift */; };
FCD04E7420F3437E0007374F /* ConvAddKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E7320F3437E0007374F /* ConvAddKernel.swift */; };
FCDC0FEB21099A1D00DC9EFB /* Tools.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDC0FEA21099A1D00DC9EFB /* Tools.swift */; };
FCDDC6C6212F9FB800E5EF74 /* PreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6C5212F9FB800E5EF74 /* PreluKernel.swift */; };
FCDDC6C8212FA3CA00E5EF74 /* ConvTransposeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */; };
FCDDC6CA212FDF6800E5EF74 /* BatchNormKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6C9212FDF6800E5EF74 /* BatchNormKernel.metal */; };
FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */; };
FCDDC6CF212FE14700E5EF74 /* PriorBoxKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */; };
FCDE8A33212A917900F4A8F6 /* ConvTransposeOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */; };
FCE3A1A92153DE5100C37CDE /* ConvAddAddPreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1A82153DE5100C37CDE /* ConvAddAddPreluOp.swift */; };
FCE3A1AB2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1AA2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift */; };
FCE3A1AD2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1AC2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift */; };
FCE3A1AF2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1AE2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift */; };
FCE3A1B12153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1B02153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal */; };
FCE3A1B32153E91900C37CDE /* ElementwiseAddPreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1B22153E91900C37CDE /* ElementwiseAddPreluKernel.metal */; };
FCE9D7B7214F869000B520C3 /* Net.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE9D7B6214F869000B520C3 /* Net.swift */; };
FCE9D7B9214FAA4800B520C3 /* NMSFetchResultKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */; };
FCEB684A212F00DB00D2448E /* PreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCEB6849212F00DB00D2448E /* PreluKernel.metal */; };
FCEB684C212F093800D2448E /* PreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEB684B212F093800D2448E /* PreluOp.swift */; };
FCEBC0F420F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEBC0F320F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift */; };
FCEBC0F620F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEBC0F520F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift */; };
FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCF2D73720E64E70007AC5F5 /* Kernel.swift */; };
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
4AA1EA852146625E00D0F791 /* BilinearInterpOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = BilinearInterpOp.swift; sourceTree = "<group>"; };
4AA1EA87214662BD00D0F791 /* BilinearInterpKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = BilinearInterpKernel.swift; sourceTree = "<group>"; };
4AA1EA892146631C00D0F791 /* BilinearInterp.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BilinearInterp.metal; sourceTree = "<group>"; };
4AA1EA8B2146640900D0F791 /* SplitOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = SplitOp.swift; sourceTree = "<group>"; };
4AA1EA8D2146647F00D0F791 /* SplitKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = SplitKernel.swift; sourceTree = "<group>"; };
4AA1EA8F214664CD00D0F791 /* Split.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Split.metal; sourceTree = "<group>"; };
4AA1EA91214665D700D0F791 /* ShapeOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ShapeOp.swift; sourceTree = "<group>"; };
4AA1EA932146661500D0F791 /* ShapeKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ShapeKernel.swift; sourceTree = "<group>"; };
4AA1EA972146666500D0F791 /* FlattenOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FlattenOp.swift; sourceTree = "<group>"; };
4AA1EA9D2148D6F900D0F791 /* ConcatKernel.inc.metal */ = {isa = PBXFileReference; explicitFileType = sourcecode.metal; fileEncoding = 4; path = ConcatKernel.inc.metal; sourceTree = "<group>"; };
4AA1EA9F2148DEEE00D0F791 /* ReshapeKernel.inc.metal */ = {isa = PBXFileReference; explicitFileType = sourcecode.metal; fileEncoding = 4; path = ReshapeKernel.inc.metal; sourceTree = "<group>"; };
4AA1EAA1214912CC00D0F791 /* FlattenKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FlattenKernel.swift; sourceTree = "<group>"; };
4AA1EAA3214A295C00D0F791 /* Split.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Split.inc.metal; sourceTree = "<group>"; };
4AA1EAA5214B5F6800D0F791 /* Shape.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Shape.metal; sourceTree = "<group>"; };
4AA1EAA7214B7AFB00D0F791 /* BilinearInterp.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BilinearInterp.inc.metal; sourceTree = "<group>"; };
4AA1EAA9214F53D800D0F791 /* BoxCoder.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BoxCoder.inc.metal; sourceTree = "<group>"; };
4AA1EAAB214F55C800D0F791 /* Softmax.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Softmax.inc.metal; sourceTree = "<group>"; };
4AA1EAAD214F5FD900D0F791 /* TransposeKernel.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = TransposeKernel.inc.metal; sourceTree = "<group>"; };
4AF928762133F1DB005B6C3A /* BoxCoder.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BoxCoder.metal; sourceTree = "<group>"; };
4AF9287821341661005B6C3A /* Softmax.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Softmax.metal; sourceTree = "<group>"; };
4AF928812135673D005B6C3A /* ConcatKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConcatKernel.metal; sourceTree = "<group>"; };
4AF9288321357BE3005B6C3A /* Elementwise.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Elementwise.metal; sourceTree = "<group>"; };
CDF58151D902A1CBAE56A0C2 /* Pods-paddle-mobile.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile.debug.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile/Pods-paddle-mobile.debug.xcconfig"; sourceTree = "<group>"; };
DD2E06330A1E7129C918DB46 /* Pods_paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; };
E2A7957C92EDA5C3BEC0FFC2 /* Pods-paddle-mobile.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile.release.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile/Pods-paddle-mobile.release.xcconfig"; sourceTree = "<group>"; };
FC0226552138F33800F395E2 /* TransposeKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = TransposeKernel.metal; sourceTree = "<group>"; };
FC0226572138F38D00F395E2 /* PoolKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PoolKernel.metal; sourceTree = "<group>"; };
FC039B6A20E11C3C0081E9F8 /* paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; };
FC039B6D20E11C3C0081E9F8 /* paddle_mobile.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = paddle_mobile.h; sourceTree = "<group>"; };
FC039B6E20E11C3C0081E9F8 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
......@@ -93,17 +194,54 @@
FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BatchNormKernel.swift; sourceTree = "<group>"; };
FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddKernel.swift; sourceTree = "<group>"; };
FC1B16B220EC9A4F00678B91 /* Kernels.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Kernels.metal; sourceTree = "<group>"; };
FC1B186520ECF1C600678B91 /* ResizeKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ResizeKernel.swift; sourceTree = "<group>"; };
FC292C5321421B2E00CF622F /* PaddleMobileGPU.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PaddleMobileGPU.h; sourceTree = "<group>"; };
FC292C5521421B4600CF622F /* PaddleMobileGPU.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = PaddleMobileGPU.m; sourceTree = "<group>"; };
FC292C7C214255BC00CF622F /* CPUCompute.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = CPUCompute.mm; sourceTree = "<group>"; };
FC292C7D214255BC00CF622F /* CPUCompute.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = CPUCompute.h; sourceTree = "<group>"; };
FC292C7E214255BC00CF622F /* MobileNetSSD.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MobileNetSSD.swift; sourceTree = "<group>"; };
FC292C862142624800CF622F /* Genet.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Genet.swift; sourceTree = "<group>"; };
FC33B0EF2147659000714A93 /* MobileNet.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MobileNet.swift; sourceTree = "<group>"; };
FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PaddleMobileUnitTest.swift; sourceTree = "<group>"; };
FC4CB74820F0B954007C0C6D /* ConvKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvKernel.metal; sourceTree = "<group>"; };
FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ProgramOptimize.swift; sourceTree = "<group>"; };
FC4FD9742140E1DE0073E130 /* PaddleMobile.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = PaddleMobile.swift; sourceTree = "<group>"; };
FC4FD9772140E4980073E130 /* PaddleMobileCPU.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PaddleMobileCPU.h; sourceTree = "<group>"; };
FC4FD9782140E4980073E130 /* libpaddle-mobile.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; path = "libpaddle-mobile.a"; sourceTree = "<group>"; };
FC4FD97D2140F2C30073E130 /* libstdc++.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = "libstdc++.tbd"; path = "usr/lib/libstdc++.tbd"; sourceTree = SDKROOT; };
FC5163F520EF556E00636C28 /* Texture2DTo2DArrayKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Texture2DTo2DArrayKernel.swift; sourceTree = "<group>"; };
FC60DB8820E9AAA500FF203F /* MetalExtension.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MetalExtension.swift; sourceTree = "<group>"; };
FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddPreluOp.swift; sourceTree = "<group>"; };
FC803BC0214CB77A0094B8E5 /* ConvAddPreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddPreluKernel.swift; sourceTree = "<group>"; };
FC803BC2214CB79C0094B8E5 /* ConvAddPreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddPreluKernel.metal; sourceTree = "<group>"; };
FC803BC4214CB8F00094B8E5 /* ConvAddPrelu.inc.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddPrelu.inc.metal; sourceTree = "<group>"; };
FC803BC6214CBA820094B8E5 /* Macro.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Macro.metal; sourceTree = "<group>"; };
FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = FetchKernel.metal; sourceTree = "<group>"; };
FC82735820E3C04200BE430A /* OpCreator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpCreator.swift; sourceTree = "<group>"; };
FC9A19E22148C31300CD9CBF /* MobilenetSSD_AR.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MobilenetSSD_AR.swift; sourceTree = "<group>"; };
FC9D037820E229E4000F735A /* OpParam.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpParam.swift; sourceTree = "<group>"; };
FC9D037F20E22FBB000F735A /* FeedOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FeedOp.swift; sourceTree = "<group>"; };
FC9D038120E2312E000F735A /* FetchOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FetchOp.swift; sourceTree = "<group>"; };
FC9D038320E23B01000F735A /* Texture.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Texture.swift; sourceTree = "<group>"; };
FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ReshapeKernel.metal; sourceTree = "<group>"; };
FCA3A1642132A5EB00084FE5 /* Common.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Common.metal; sourceTree = "<group>"; };
FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvTransposeKernel.metal; sourceTree = "<group>"; };
FCA67CD42138272900BD58AA /* ConvAddMetal.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddMetal.metal; sourceTree = "<group>"; };
FCA67CD6213827AC00BD58AA /* ConvAddBNReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddBNReluKernel.metal; sourceTree = "<group>"; };
FCA67CD82138287B00BD58AA /* ConvBNReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvBNReluKernel.metal; sourceTree = "<group>"; };
FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DwConvBNReluOp.swift; sourceTree = "<group>"; };
FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvBNReluOp.swift; sourceTree = "<group>"; };
FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvBNReluKernel.swift; sourceTree = "<group>"; };
FCBCCC5C2122F8A100D94F7E /* DepthwiseConvOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DepthwiseConvOp.swift; sourceTree = "<group>"; };
FCBCCC5E2122FB3B00D94F7E /* PriorBoxOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PriorBoxOp.swift; sourceTree = "<group>"; };
FCBCCC602122FBDF00D94F7E /* PriorBoxKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PriorBoxKernel.swift; sourceTree = "<group>"; };
FCBCCC622122FCC000D94F7E /* TransposeKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TransposeKernel.swift; sourceTree = "<group>"; };
FCBCCC642122FCD700D94F7E /* TransposeOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TransposeOp.swift; sourceTree = "<group>"; };
FCBCCC66212306B000D94F7E /* ConcatOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConcatOp.swift; sourceTree = "<group>"; };
FCBCCC68212306D300D94F7E /* ConcatKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConcatKernel.swift; sourceTree = "<group>"; };
FCBCCC6A2123071700D94F7E /* BoxcoderOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BoxcoderOp.swift; sourceTree = "<group>"; };
FCBCCC6C2123073A00D94F7E /* BoxcoderKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BoxcoderKernel.swift; sourceTree = "<group>"; };
FCBCCC6E2123097100D94F7E /* MulticlassNMSOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MulticlassNMSOp.swift; sourceTree = "<group>"; };
FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MulticlassNMSKernel.swift; sourceTree = "<group>"; };
FCD04E6520F314C50007374F /* PoolOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PoolOp.swift; sourceTree = "<group>"; };
FCD04E6720F315020007374F /* PoolKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PoolKernel.swift; sourceTree = "<group>"; };
FCD04E6920F319EC0007374F /* SoftmaxOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SoftmaxOp.swift; sourceTree = "<group>"; };
......@@ -113,9 +251,25 @@
FCD04E7120F343420007374F /* ConvAddOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddOp.swift; sourceTree = "<group>"; };
FCD04E7320F3437E0007374F /* ConvAddKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddKernel.swift; sourceTree = "<group>"; };
FCDC0FEA21099A1D00DC9EFB /* Tools.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Tools.swift; sourceTree = "<group>"; };
FCDDC6C5212F9FB800E5EF74 /* PreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PreluKernel.swift; sourceTree = "<group>"; };
FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvTransposeKernel.swift; sourceTree = "<group>"; };
FCDDC6C9212FDF6800E5EF74 /* BatchNormKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = BatchNormKernel.metal; sourceTree = "<group>"; };
FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ReluKernel.metal; sourceTree = "<group>"; };
FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PriorBoxKernel.metal; sourceTree = "<group>"; };
FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvTransposeOp.swift; sourceTree = "<group>"; };
FCE3A1A82153DE5100C37CDE /* ConvAddAddPreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddAddPreluOp.swift; sourceTree = "<group>"; };
FCE3A1AA2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddAddPreluKernel.swift; sourceTree = "<group>"; };
FCE3A1AC2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddPreluOp.swift; sourceTree = "<group>"; };
FCE3A1AE2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddPreluKernel.swift; sourceTree = "<group>"; };
FCE3A1B02153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ElementwiseAddPreluKernel.inc.metal; sourceTree = "<group>"; };
FCE3A1B22153E91900C37CDE /* ElementwiseAddPreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ElementwiseAddPreluKernel.metal; sourceTree = "<group>"; };
FCE9D7B6214F869000B520C3 /* Net.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Net.swift; sourceTree = "<group>"; };
FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = NMSFetchResultKernel.metal; sourceTree = "<group>"; };
FCEB6849212F00DB00D2448E /* PreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PreluKernel.metal; sourceTree = "<group>"; };
FCEB684B212F093800D2448E /* PreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PreluOp.swift; sourceTree = "<group>"; };
FCEBC0F320F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; name = ConvAddBatchNormReluOp.swift; path = "paddle-mobile/Operators/ConvAddBatchNormReluOp.swift"; sourceTree = SOURCE_ROOT; };
FCEBC0F520F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddBatchNormReluKernel.swift; sourceTree = "<group>"; };
FCF2D73720E64E70007AC5F5 /* Kernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; name = Kernel.swift; path = "paddle-mobile/Operators/Kernels/Kernel.swift"; sourceTree = SOURCE_ROOT; };
FCF2D73720E64E70007AC5F5 /* Kernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; name = Kernel.swift; path = "paddle-mobile/Operators/Kernels/Base/Kernel.swift"; sourceTree = SOURCE_ROOT; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
......@@ -123,7 +277,9 @@
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
FC4FD97E2140F2C30073E130 /* libstdc++.tbd in Frameworks */,
D3831F70E7E0B565B9AC22DA /* Pods_paddle_mobile.framework in Frameworks */,
FC4FD97A2140E4980073E130 /* libpaddle-mobile.a in Frameworks */,
);
runOnlyForDeploymentPostprocessing = 0;
};
......@@ -133,6 +289,7 @@
336CBE234BF5DE48658DE65F /* Frameworks */ = {
isa = PBXGroup;
children = (
FC4FD97D2140F2C30073E130 /* libstdc++.tbd */,
DD2E06330A1E7129C918DB46 /* Pods_paddle_mobile.framework */,
);
name = Frameworks;
......@@ -168,10 +325,19 @@
FC039B6C20E11C3C0081E9F8 /* paddle-mobile */ = {
isa = PBXGroup;
children = (
FCE9D7B6214F869000B520C3 /* Net.swift */,
FC9A19E22148C31300CD9CBF /* MobilenetSSD_AR.swift */,
FC33B0EF2147659000714A93 /* MobileNet.swift */,
FC292C862142624800CF622F /* Genet.swift */,
FC292C7E214255BC00CF622F /* MobileNetSSD.swift */,
FC292C7C214255BC00CF622F /* CPUCompute.mm */,
FC292C7D214255BC00CF622F /* CPUCompute.h */,
FC292C5521421B4600CF622F /* PaddleMobileGPU.m */,
FC292C5321421B2E00CF622F /* PaddleMobileGPU.h */,
FC4FD9762140E4920073E130 /* CPU */,
FC4FD9742140E1DE0073E130 /* PaddleMobile.swift */,
FC039BAE20E11CC20081E9F8 /* Program */,
FC039BA320E11CBC0081E9F8 /* Operators */,
FC039BA120E11CB70081E9F8 /* Loader.swift */,
FC039B9A20E11CA00081E9F8 /* Executor.swift */,
FC039B9C20E11CB20081E9F8 /* framework */,
FC039B9320E11C9A0081E9F8 /* Common */,
FC039B6D20E11C3C0081E9F8 /* paddle_mobile.h */,
......@@ -196,6 +362,8 @@
FC039B9C20E11CB20081E9F8 /* framework */ = {
isa = PBXGroup;
children = (
FC039BA120E11CB70081E9F8 /* Loader.swift */,
FC039B9A20E11CA00081E9F8 /* Executor.swift */,
FC039B9D20E11CB20081E9F8 /* Tensor.swift */,
FC039B9E20E11CB20081E9F8 /* Dim.swift */,
FC9D038320E23B01000F735A /* Texture.swift */,
......@@ -219,6 +387,23 @@
FCD04E6920F319EC0007374F /* SoftmaxOp.swift */,
FCD04E6D20F31B4B0007374F /* ReshapeOp.swift */,
FCD04E7120F343420007374F /* ConvAddOp.swift */,
FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */,
FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */,
FCBCCC5C2122F8A100D94F7E /* DepthwiseConvOp.swift */,
FCBCCC5E2122FB3B00D94F7E /* PriorBoxOp.swift */,
FCBCCC642122FCD700D94F7E /* TransposeOp.swift */,
FCBCCC66212306B000D94F7E /* ConcatOp.swift */,
FCBCCC6A2123071700D94F7E /* BoxcoderOp.swift */,
4AA1EA8B2146640900D0F791 /* SplitOp.swift */,
4AA1EA91214665D700D0F791 /* ShapeOp.swift */,
4AA1EA972146666500D0F791 /* FlattenOp.swift */,
4AA1EA852146625E00D0F791 /* BilinearInterpOp.swift */,
FCBCCC6E2123097100D94F7E /* MulticlassNMSOp.swift */,
FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */,
FCEB684B212F093800D2448E /* PreluOp.swift */,
FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */,
FCE3A1A82153DE5100C37CDE /* ConvAddAddPreluOp.swift */,
FCE3A1AC2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift */,
);
path = Operators;
sourceTree = "<group>";
......@@ -243,24 +428,46 @@
FC086BA520E67E8500D85EF7 /* Kernels */ = {
isa = PBXGroup;
children = (
FCDDC6CD212FE02100E5EF74 /* Base */,
FCEB6837212F00B100D2448E /* metal */,
FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */,
FC0E2DBB20EE45FE009C1FAC /* ConvKernel.swift */,
FCF2D73720E64E70007AC5F5 /* Kernel.swift */,
FC1B16B220EC9A4F00678B91 /* Kernels.metal */,
FC1B186520ECF1C600678B91 /* ResizeKernel.swift */,
FC0E2DB920EE3B8D009C1FAC /* ReluKernel.swift */,
FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */,
FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */,
FC5163F520EF556E00636C28 /* Texture2DTo2DArrayKernel.swift */,
FC4CB74820F0B954007C0C6D /* ConvKernel.metal */,
FCEBC0F520F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift */,
FCD04E6720F315020007374F /* PoolKernel.swift */,
FCD04E6B20F31A280007374F /* SoftmaxKernel.swift */,
FCD04E6F20F31B720007374F /* ReshapeKernel.swift */,
4AA1EAA1214912CC00D0F791 /* FlattenKernel.swift */,
FCD04E7320F3437E0007374F /* ConvAddKernel.swift */,
FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */,
FCBCCC602122FBDF00D94F7E /* PriorBoxKernel.swift */,
FCBCCC622122FCC000D94F7E /* TransposeKernel.swift */,
FCBCCC68212306D300D94F7E /* ConcatKernel.swift */,
FCBCCC6C2123073A00D94F7E /* BoxcoderKernel.swift */,
4AA1EA8D2146647F00D0F791 /* SplitKernel.swift */,
4AA1EA932146661500D0F791 /* ShapeKernel.swift */,
4AA1EA87214662BD00D0F791 /* BilinearInterpKernel.swift */,
FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */,
FCDDC6C5212F9FB800E5EF74 /* PreluKernel.swift */,
FC803BC0214CB77A0094B8E5 /* ConvAddPreluKernel.swift */,
FCE3A1AA2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift */,
FCE3A1AE2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift */,
);
path = Kernels;
sourceTree = "<group>";
};
FC4FD9762140E4920073E130 /* CPU */ = {
isa = PBXGroup;
children = (
FC4FD9782140E4980073E130 /* libpaddle-mobile.a */,
FC4FD9772140E4980073E130 /* PaddleMobileCPU.h */,
);
path = CPU;
sourceTree = "<group>";
};
FCD592FA20E248EC00252966 /* Base */ = {
isa = PBXGroup;
children = (
......@@ -271,6 +478,56 @@
path = Base;
sourceTree = "<group>";
};
FCDDC6CD212FE02100E5EF74 /* Base */ = {
isa = PBXGroup;
children = (
FCF2D73720E64E70007AC5F5 /* Kernel.swift */,
);
path = Base;
sourceTree = "<group>";
};
FCEB6837212F00B100D2448E /* metal */ = {
isa = PBXGroup;
children = (
4AF928812135673D005B6C3A /* ConcatKernel.metal */,
4AA1EA9D2148D6F900D0F791 /* ConcatKernel.inc.metal */,
4AF9288321357BE3005B6C3A /* Elementwise.metal */,
FC1B16B220EC9A4F00678B91 /* Kernels.metal */,
FC4CB74820F0B954007C0C6D /* ConvKernel.metal */,
4AF928762133F1DB005B6C3A /* BoxCoder.metal */,
4AA1EAA9214F53D800D0F791 /* BoxCoder.inc.metal */,
4AA1EAA5214B5F6800D0F791 /* Shape.metal */,
4AA1EA8F214664CD00D0F791 /* Split.metal */,
4AA1EAA3214A295C00D0F791 /* Split.inc.metal */,
4AA1EA892146631C00D0F791 /* BilinearInterp.metal */,
4AA1EAA7214B7AFB00D0F791 /* BilinearInterp.inc.metal */,
4AF9287821341661005B6C3A /* Softmax.metal */,
4AA1EAAB214F55C800D0F791 /* Softmax.inc.metal */,
FCEB6849212F00DB00D2448E /* PreluKernel.metal */,
FCDDC6C9212FDF6800E5EF74 /* BatchNormKernel.metal */,
FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */,
FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */,
FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */,
4AA1EA9F2148DEEE00D0F791 /* ReshapeKernel.inc.metal */,
FCA3A1642132A5EB00084FE5 /* Common.metal */,
FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */,
FCA67CD42138272900BD58AA /* ConvAddMetal.metal */,
FCA67CD6213827AC00BD58AA /* ConvAddBNReluKernel.metal */,
FCA67CD82138287B00BD58AA /* ConvBNReluKernel.metal */,
FC0226552138F33800F395E2 /* TransposeKernel.metal */,
4AA1EAAD214F5FD900D0F791 /* TransposeKernel.inc.metal */,
FC0226572138F38D00F395E2 /* PoolKernel.metal */,
FC803BC2214CB79C0094B8E5 /* ConvAddPreluKernel.metal */,
FC803BC4214CB8F00094B8E5 /* ConvAddPrelu.inc.metal */,
FC803BC6214CBA820094B8E5 /* Macro.metal */,
FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */,
FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */,
FCE3A1B02153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal */,
FCE3A1B22153E91900C37CDE /* ElementwiseAddPreluKernel.metal */,
);
path = metal;
sourceTree = "<group>";
};
/* End PBXGroup section */
/* Begin PBXHeadersBuildPhase section */
......@@ -278,6 +535,10 @@
isa = PBXHeadersBuildPhase;
buildActionMask = 2147483647;
files = (
FC4FD9792140E4980073E130 /* PaddleMobileCPU.h in Headers */,
FC292C85214257CB00CF622F /* CPUCompute.h in Headers */,
FC292C5421421B2F00CF622F /* PaddleMobileGPU.h in Headers */,
4AA1EA9E2148D6F900D0F791 /* ConcatKernel.inc.metal in Headers */,
FC039B6F20E11C3C0081E9F8 /* paddle_mobile.h in Headers */,
);
runOnlyForDeploymentPostprocessing = 0;
......@@ -315,6 +576,7 @@
TargetAttributes = {
FC039B6920E11C3C0081E9F8 = {
CreatedOnToolsVersion = 9.3.1;
LastSwiftMigration = 0940;
};
};
};
......@@ -372,53 +634,124 @@
buildActionMask = 2147483647;
files = (
FC9D038020E22FBB000F735A /* FeedOp.swift in Sources */,
4AA1EAAA214F53D800D0F791 /* BoxCoder.inc.metal in Sources */,
FC039B9F20E11CB20081E9F8 /* Tensor.swift in Sources */,
FC803BC9214CFC8D0094B8E5 /* FetchKernel.metal in Sources */,
FCA67CD7213827AC00BD58AA /* ConvAddBNReluKernel.metal in Sources */,
4AF9287921341661005B6C3A /* Softmax.metal in Sources */,
4AA1EA942146661500D0F791 /* ShapeKernel.swift in Sources */,
FC0E2DBC20EE45FE009C1FAC /* ConvKernel.swift in Sources */,
FC039BAA20E11CBC0081E9F8 /* ElementwiseAddOp.swift in Sources */,
FCDE8A33212A917900F4A8F6 /* ConvTransposeOp.swift in Sources */,
FCBCCC6B2123071700D94F7E /* BoxcoderOp.swift in Sources */,
4AA1EAAE214F5FD900D0F791 /* TransposeKernel.inc.metal in Sources */,
4AA1EAA4214A295C00D0F791 /* Split.inc.metal in Sources */,
FC803BC7214CBA820094B8E5 /* Macro.metal in Sources */,
FC039B9B20E11CA00081E9F8 /* Executor.swift in Sources */,
4AF9288421357BE3005B6C3A /* Elementwise.metal in Sources */,
FCD04E7020F31B720007374F /* ReshapeKernel.swift in Sources */,
FCE3A1B12153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal in Sources */,
FCD04E7220F343420007374F /* ConvAddOp.swift in Sources */,
FC039BBB20E11CC20081E9F8 /* ProgramDesc.swift in Sources */,
FCE3A1AB2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift in Sources */,
FC9D037920E229E4000F735A /* OpParam.swift in Sources */,
FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */,
FC1B186620ECF1C600678B91 /* ResizeKernel.swift in Sources */,
FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */,
FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */,
FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */,
FCDDC6C6212F9FB800E5EF74 /* PreluKernel.swift in Sources */,
FCA67CD52138272900BD58AA /* ConvAddMetal.metal in Sources */,
FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */,
4AA1EA8C2146640900D0F791 /* SplitOp.swift in Sources */,
FC292C81214255BD00CF622F /* CPUCompute.mm in Sources */,
FCEBC0F420F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift in Sources */,
4AA1EAAC214F55C800D0F791 /* Softmax.inc.metal in Sources */,
FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */,
4AF928772133F1DB005B6C3A /* BoxCoder.metal in Sources */,
FC803BBF214CB65A0094B8E5 /* ConvAddPreluOp.swift in Sources */,
FC33B0F02147659000714A93 /* MobileNet.swift in Sources */,
FCEB684C212F093800D2448E /* PreluOp.swift in Sources */,
4AA1EAA8214B7AFB00D0F791 /* BilinearInterp.inc.metal in Sources */,
FCA67CD92138287B00BD58AA /* ConvBNReluKernel.metal in Sources */,
FC60DB8920E9AAA500FF203F /* MetalExtension.swift in Sources */,
FCEBC0F620F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift in Sources */,
4AA1EA8A2146631C00D0F791 /* BilinearInterp.metal in Sources */,
FCDDC6CA212FDF6800E5EF74 /* BatchNormKernel.metal in Sources */,
FC1B16B320EC9A4F00678B91 /* Kernels.metal in Sources */,
FC039BBA20E11CC20081E9F8 /* TensorDesc.swift in Sources */,
FC039BA020E11CB20081E9F8 /* Dim.swift in Sources */,
FC039BB820E11CC20081E9F8 /* framework.pb.swift in Sources */,
FC039B9920E11C9A0081E9F8 /* Types.swift in Sources */,
FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */,
FCA3A1632132A4AC00084FE5 /* ReshapeKernel.metal in Sources */,
FC4FD9752140E1DE0073E130 /* PaddleMobile.swift in Sources */,
FCBCCC592122F42700D94F7E /* ConvBNReluOp.swift in Sources */,
FC039BA920E11CBC0081E9F8 /* ConvOp.swift in Sources */,
FC9D038420E23B01000F735A /* Texture.swift in Sources */,
FCE3A1B32153E91900C37CDE /* ElementwiseAddPreluKernel.metal in Sources */,
4AA1EAA2214912CD00D0F791 /* FlattenKernel.swift in Sources */,
4AA1EA982146666500D0F791 /* FlattenOp.swift in Sources */,
FCBCCC652122FCD700D94F7E /* TransposeOp.swift in Sources */,
4AA1EAA6214B5F6800D0F791 /* Shape.metal in Sources */,
FCD04E6E20F31B4B0007374F /* ReshapeOp.swift in Sources */,
FC039B9820E11C9A0081E9F8 /* Errors.swift in Sources */,
FC039BBF20E11CC20081E9F8 /* Attribute.swift in Sources */,
4AA1EA8E2146647F00D0F791 /* SplitKernel.swift in Sources */,
FCD04E7420F3437E0007374F /* ConvAddKernel.swift in Sources */,
FC039BB920E11CC20081E9F8 /* Scope.swift in Sources */,
FC292C5621421B4600CF622F /* PaddleMobileGPU.m in Sources */,
FCD04E6620F314C50007374F /* PoolOp.swift in Sources */,
FCE9D7B9214FAA4800B520C3 /* NMSFetchResultKernel.metal in Sources */,
FC039BAC20E11CBC0081E9F8 /* BatchNormOp.swift in Sources */,
FCBCCC6F2123097100D94F7E /* MulticlassNMSOp.swift in Sources */,
FC039BBC20E11CC20081E9F8 /* VarDesc.swift in Sources */,
FC292C872142624800CF622F /* Genet.swift in Sources */,
FC803BC5214CB8F00094B8E5 /* ConvAddPrelu.inc.metal in Sources */,
4AF928822135673D005B6C3A /* ConcatKernel.metal in Sources */,
FCBCCC632122FCC000D94F7E /* TransposeKernel.swift in Sources */,
FCBCCC71212309A700D94F7E /* MulticlassNMSKernel.swift in Sources */,
FCDC0FEB21099A1D00DC9EFB /* Tools.swift in Sources */,
FC0E2DBA20EE3B8D009C1FAC /* ReluKernel.swift in Sources */,
4AA1EA862146625E00D0F791 /* BilinearInterpOp.swift in Sources */,
FCBCCC6D2123073A00D94F7E /* BoxcoderKernel.swift in Sources */,
FCBCCC69212306D300D94F7E /* ConcatKernel.swift in Sources */,
FCDDC6C8212FA3CA00E5EF74 /* ConvTransposeKernel.swift in Sources */,
FC82735920E3C04200BE430A /* OpCreator.swift in Sources */,
FCA3A1652132A5EB00084FE5 /* Common.metal in Sources */,
4AA1EA92214665D700D0F791 /* ShapeOp.swift in Sources */,
FC803BC1214CB77A0094B8E5 /* ConvAddPreluKernel.swift in Sources */,
FCBCCC5D2122F8A100D94F7E /* DepthwiseConvOp.swift in Sources */,
FCE3A1AF2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift in Sources */,
FCE9D7B7214F869000B520C3 /* Net.swift in Sources */,
FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */,
FC039BAB20E11CBC0081E9F8 /* Operator.swift in Sources */,
FCD04E6A20F319EC0007374F /* SoftmaxOp.swift in Sources */,
FC292C82214255BD00CF622F /* MobileNetSSD.swift in Sources */,
FCBCCC612122FBDF00D94F7E /* PriorBoxKernel.swift in Sources */,
FCBCCC5F2122FB3B00D94F7E /* PriorBoxOp.swift in Sources */,
FC9D038220E2312E000F735A /* FetchOp.swift in Sources */,
FCA67B1721364EF000BD58AA /* ConvTransposeKernel.metal in Sources */,
FC039BBD20E11CC20081E9F8 /* Program.swift in Sources */,
FC039BA220E11CB70081E9F8 /* Loader.swift in Sources */,
FCBCCC67212306B000D94F7E /* ConcatOp.swift in Sources */,
FCD04E6C20F31A280007374F /* SoftmaxKernel.swift in Sources */,
FCEB684A212F00DB00D2448E /* PreluKernel.metal in Sources */,
4AA1EAA02148DEEE00D0F791 /* ReshapeKernel.inc.metal in Sources */,
FC9A19E32148C31300CD9CBF /* MobilenetSSD_AR.swift in Sources */,
FCDDC6CF212FE14700E5EF74 /* PriorBoxKernel.metal in Sources */,
FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */,
FCE3A1A92153DE5100C37CDE /* ConvAddAddPreluOp.swift in Sources */,
FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */,
FCE3A1AD2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift in Sources */,
FC039BC020E11CC20081E9F8 /* BlockDesc.swift in Sources */,
FC803BC3214CB79C0094B8E5 /* ConvAddPreluKernel.metal in Sources */,
4AA1EA90214664CD00D0F791 /* Split.metal in Sources */,
FCD04E6820F315020007374F /* PoolKernel.swift in Sources */,
FC0226582138F38D00F395E2 /* PoolKernel.metal in Sources */,
FC039BAD20E11CBC0081E9F8 /* ReluOp.swift in Sources */,
FCBCCC572122F41300D94F7E /* DwConvBNReluOp.swift in Sources */,
FC039BBE20E11CC20081E9F8 /* OpDesc.swift in Sources */,
4AA1EA88214662BD00D0F791 /* BilinearInterpKernel.swift in Sources */,
FC039B9720E11C9A0081E9F8 /* Extensions.swift in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
......@@ -550,6 +883,7 @@
isa = XCBuildConfiguration;
baseConfigurationReference = CDF58151D902A1CBAE56A0C2 /* Pods-paddle-mobile.debug.xcconfig */;
buildSettings = {
CLANG_ENABLE_MODULES = YES;
CODE_SIGN_IDENTITY = "";
CODE_SIGN_STYLE = Automatic;
DEFINES_MODULE = YES;
......@@ -557,6 +891,7 @@
DYLIB_COMPATIBILITY_VERSION = 1;
DYLIB_CURRENT_VERSION = 1;
DYLIB_INSTALL_NAME_BASE = "@rpath";
ENABLE_BITCODE = NO;
INFOPLIST_FILE = "paddle-mobile/Info.plist";
INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks";
IPHONEOS_DEPLOYMENT_TARGET = 9.0;
......@@ -565,10 +900,16 @@
"@executable_path/Frameworks",
"@loader_path/Frameworks",
);
LIBRARY_SEARCH_PATHS = (
"$(inherited)",
"$(PROJECT_DIR)/paddle-mobile/CPU",
);
MACH_O_TYPE = mh_dylib;
MTL_LANGUAGE_REVISION = UseDeploymentTarget;
PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile";
PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
SKIP_INSTALL = YES;
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
SWIFT_VERSION = 4.0;
TARGETED_DEVICE_FAMILY = "1,2";
};
......@@ -578,6 +919,7 @@
isa = XCBuildConfiguration;
baseConfigurationReference = E2A7957C92EDA5C3BEC0FFC2 /* Pods-paddle-mobile.release.xcconfig */;
buildSettings = {
CLANG_ENABLE_MODULES = YES;
CODE_SIGN_IDENTITY = "";
CODE_SIGN_STYLE = Automatic;
DEFINES_MODULE = YES;
......@@ -585,6 +927,7 @@
DYLIB_COMPATIBILITY_VERSION = 1;
DYLIB_CURRENT_VERSION = 1;
DYLIB_INSTALL_NAME_BASE = "@rpath";
ENABLE_BITCODE = NO;
INFOPLIST_FILE = "paddle-mobile/Info.plist";
INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks";
IPHONEOS_DEPLOYMENT_TARGET = 9.0;
......@@ -593,6 +936,11 @@
"@executable_path/Frameworks",
"@loader_path/Frameworks",
);
LIBRARY_SEARCH_PATHS = (
"$(inherited)",
"$(PROJECT_DIR)/paddle-mobile/CPU",
);
MACH_O_TYPE = mh_dylib;
MTL_LANGUAGE_REVISION = UseDeploymentTarget;
PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile";
PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
......
<?xml version="1.0" encoding="UTF-8"?>
<Scheme
LastUpgradeVersion = "0940"
version = "1.3">
<BuildAction
parallelizeBuildables = "YES"
buildImplicitDependencies = "YES">
<BuildActionEntries>
<BuildActionEntry
buildForTesting = "YES"
buildForRunning = "YES"
buildForProfiling = "YES"
buildForArchiving = "YES"
buildForAnalyzing = "YES">
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "FC039B6920E11C3C0081E9F8"
BuildableName = "paddle_mobile.framework"
BlueprintName = "paddle-mobile"
ReferencedContainer = "container:paddle-mobile.xcodeproj">
</BuildableReference>
</BuildActionEntry>
</BuildActionEntries>
</BuildAction>
<TestAction
buildConfiguration = "Debug"
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
shouldUseLaunchSchemeArgsEnv = "YES">
<Testables>
</Testables>
<AdditionalOptions>
</AdditionalOptions>
</TestAction>
<LaunchAction
buildConfiguration = "Debug"
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
launchStyle = "0"
useCustomWorkingDirectory = "NO"
ignoresPersistentStateOnLaunch = "NO"
debugDocumentVersioning = "YES"
debugServiceExtension = "internal"
allowLocationSimulation = "YES">
<MacroExpansion>
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "FC039B6920E11C3C0081E9F8"
BuildableName = "paddle_mobile.framework"
BlueprintName = "paddle-mobile"
ReferencedContainer = "container:paddle-mobile.xcodeproj">
</BuildableReference>
</MacroExpansion>
<AdditionalOptions>
</AdditionalOptions>
</LaunchAction>
<ProfileAction
buildConfiguration = "Release"
shouldUseLaunchSchemeArgsEnv = "YES"
savedToolIdentifier = ""
useCustomWorkingDirectory = "NO"
debugDocumentVersioning = "YES">
<MacroExpansion>
<BuildableReference
BuildableIdentifier = "primary"
BlueprintIdentifier = "FC039B6920E11C3C0081E9F8"
BuildableName = "paddle_mobile.framework"
BlueprintName = "paddle-mobile"
ReferencedContainer = "container:paddle-mobile.xcodeproj">
</BuildableReference>
</MacroExpansion>
</ProfileAction>
<AnalyzeAction
buildConfiguration = "Debug">
</AnalyzeAction>
<ArchiveAction
buildConfiguration = "Release"
revealArchiveInOrganizer = "YES">
</ArchiveAction>
</Scheme>
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>SchemeUserState</key>
<dict>
<key>paddle-mobile.xcscheme</key>
<dict>
<key>orderHint</key>
<integer>0</integer>
</dict>
</dict>
<key>SuppressBuildableAutocreation</key>
<dict>
<key>FC039B6920E11C3C0081E9F8</key>
<dict>
<key>primary</key>
<true/>
</dict>
</dict>
</dict>
</plist>
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#import <CoreImage/CoreImage.h>
#import <Foundation/Foundation.h>
@interface PaddleMobileCPUResult: NSObject
@property (assign, nonatomic, readonly) float *output;
@property (assign, nonatomic, readonly) int outputSize;
-(void)releaseOutput;
@end
@interface PaddleMobileCPU : NSObject
/*
 Create an instance
 */
- (instancetype)init;
/*
 Load the model and allocate memory
 */
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
/*
 Load a model stored in separate-file form; pass in the model directory
 */
- (BOOL)load:(NSString *)modelAndWeightPath;
/*
 * Load the model from memory
 * */
- (BOOL)LoadCombinedMemory:(size_t)modelLen
andModelBuf:(const uint8_t *)modelBuf
andModelParamsLen:(size_t)combinedParamsLen
andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf;
/*
 * Preprocess the image; the caller must allocate the output memory and release it afterwards
 * */
-(void)preprocess:(CGImageRef)image
output:(float *)output
means:(NSArray<NSNumber *> *)means
scale:(float)scale
dim:(NSArray<NSNumber *> *)dim;
/*
 * Run prediction on preprocessed data; when finished with the returned result, call its releaseOutput method to free it
 * */
- (PaddleMobileCPUResult *)predictInput:(float *)input
dim:(NSArray<NSNumber *> *)dim;
/*
 Run prediction; means and scale are the preprocessing parameters used when the model was trained. If no such preprocessing was applied during training, use the plain predict method instead
 */
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
/*
 Run prediction with the default means of 0 and scale of 1.0
 */
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
/*
 Release memory
 */
- (void)clear;
@end
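// A hypothetical Swift-side sketch of the PaddleMobileCPU interface above. It assumes
// the header is exposed through the framework umbrella header and that the selectors
// are imported with Swift's default Objective-C renaming; the paths and dimensions
// are placeholders rather than values shipped with the project.
func runCPUPrediction(image: CGImage) {
    let runner = PaddleMobileCPU()
    guard runner.load("/path/to/model", andWeightsPath: "/path/to/params") else {
        print("load failed")
        return
    }
    // NCHW dimensions for a single 224 x 224 RGB image (placeholder values).
    let dim: [NSNumber] = [1, 3, 224, 224]
    if let results = runner.predict(image, dim: dim) {
        print(results)
    }
    runner.clear()
}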
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#import <Foundation/Foundation.h>
@interface CPUResult: NSObject
@property (assign, nonatomic) float *output;
@property (assign, nonatomic) int outputSize;
@end
@interface NMSCompute: NSObject
@property (assign, nonatomic) float scoreThredshold;
@property (assign, nonatomic) int nmsTopK;
@property (assign, nonatomic) int keepTopK;
@property (assign, nonatomic) float nmsEta;
@property (assign, nonatomic) float nmsThreshold;
@property (assign, nonatomic) int background_label;
@property (strong, nonatomic) NSArray<NSNumber *> *scoreDim;
@property (strong, nonatomic) NSArray<NSNumber *> *bboxDim;
-(CPUResult *)computeWithScore:(float *)score andBBoxs:(float *)bbox;
@end
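// A hypothetical Swift-side sketch of driving NMSCompute above, assuming the default
// Objective-C to Swift import of the selectors; the thresholds and tensor shapes are
// placeholders, not values taken from a shipped model.
func runNMS(scores: UnsafeMutablePointer<Float>, boxes: UnsafeMutablePointer<Float>) {
    let nms = NMSCompute()
    nms.scoreThredshold = 0.25
    nms.nmsTopK = 400
    nms.keepTopK = 200
    nms.nmsEta = 1.0
    nms.nmsThreshold = 0.45
    nms.background_label = 0
    nms.scoreDim = [1, 21, 1917]   // [batch, class count, box count] (placeholder shape)
    nms.bboxDim = [1, 1917, 4]     // [batch, box count, 4] (placeholder shape)
    if let result = nms.compute(withScore: scores, andBBoxs: boxes) {
        // Each kept detection occupies 6 floats: label, score, xmin, ymin, xmax, ymax.
        print("kept \(result.outputSize / 6) detections")
    }
}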
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#import "CPUCompute.h"
#import <map>
#import <vector>
#import <utility>
#import <algorithm>
struct NMSParam {
float *score_data;
float *box_data;
float *output;
int output_size;
std::vector<int> score_dim;
std::vector<int> box_dim;
float scoreThredshold;
int nmsTopK;
int keepTopK;
float nmsEta;
float nmsThreshold;
int background_label;
};
constexpr int kOutputDim = 6;
constexpr int kBBoxSize = 4;
template <class T>
bool SortScorePairDescend(const std::pair<float, T>& pair1,
const std::pair<float, T>& pair2) {
return pair1.first > pair2.first;
}
template <class T>
static inline void GetMaxScoreIndex(
const std::vector<T>& scores, const T threshold, int top_k,
std::vector<std::pair<T, int>>* sorted_indices) {
for (size_t i = 0; i < scores.size(); ++i) {
if (scores[i] > threshold) {
sorted_indices->push_back(std::make_pair(scores[i], i));
}
}
// Sort the score pair according to the scores in descending order
std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
SortScorePairDescend<int>);
// Keep top_k scores if needed.
if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
sorted_indices->resize(top_k);
}
}
template <class T>
static inline T BBoxArea(const T* box, const bool normalized) {
if (box[2] < box[0] || box[3] < box[1]) {
// If coordinate values are invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
return static_cast<T>(0.);
} else {
const T w = box[2] - box[0];
const T h = box[3] - box[1];
if (normalized) {
return w * h;
} else {
// If coordinate values are not within range [0, 1].
return (w + 1) * (h + 1);
}
}
}
template <class T>
static inline T JaccardOverlap(const T* box1, const T* box2,
const bool normalized) {
if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
box2[3] < box1[1]) {
return static_cast<T>(0.);
} else {
const T inter_xmin = std::max(box1[0], box2[0]);
const T inter_ymin = std::max(box1[1], box2[1]);
const T inter_xmax = std::min(box1[2], box2[2]);
const T inter_ymax = std::min(box1[3], box2[3]);
const T inter_w = inter_xmax - inter_xmin;
const T inter_h = inter_ymax - inter_ymin;
const T inter_area = inter_w * inter_h;
const T bbox1_area = BBoxArea<T>(box1, normalized);
const T bbox2_area = BBoxArea<T>(box2, normalized);
return inter_area / (bbox1_area + bbox2_area - inter_area);
}
}
template <typename T>
static inline void NMSFast(
const T *bbox_data,
std::vector<int> bbox_dim,
const T *score_data,
const T score_threshold, const T nms_threshold,
const T eta, const int top_k,
std::vector<int>* selected_indices) {
// The total boxes for each instance.
int num_boxes = bbox_dim[0];
// 4: [xmin ymin xmax ymax]
int box_size = bbox_dim[1];
std::vector<T> scores_data(num_boxes);
std::copy_n(score_data, num_boxes, scores_data.begin());
std::vector<std::pair<T, int>> sorted_indices;
GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);
selected_indices->clear();
T adaptive_threshold = nms_threshold;
while (sorted_indices.size() != 0) {
const int idx = sorted_indices.front().second;
bool keep = true;
for (size_t k = 0; k < selected_indices->size(); ++k) {
if (keep) {
const int kept_idx = (*selected_indices)[k];
T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
bbox_data + kept_idx * box_size, true);
keep = overlap <= adaptive_threshold;
} else {
break;
}
}
if (keep) {
selected_indices->push_back(idx);
}
sorted_indices.erase(sorted_indices.begin());
if (keep && eta < 1 && adaptive_threshold > 0.5) {
adaptive_threshold *= eta;
}
}
}
template <typename T>
void MultiClassNMS(const T *boxes_data,
const std::vector<int> &box_dim,
const T *scores_data,
const std::vector<int> &score_dim,
std::map<int, std::vector<int>>* indices, int* num_nmsed_out,
const int& background_label, const int& nms_top_k,
const int& keep_top_k, const T& nms_threshold,
const T& nms_eta, const T& score_threshold) {
int64_t class_num = score_dim[0];
int64_t predict_dim = score_dim[1];
int num_det = 0;
for (int c = 0; c < class_num; ++c) {
if (c == background_label) continue;
const T *score_data = scores_data + c * predict_dim;
/// [c] is key
NMSFast<T>(boxes_data, box_dim, score_data, score_threshold, nms_threshold, nms_eta,
nms_top_k, &((*indices)[c]));
num_det += (*indices)[c].size();
}
*num_nmsed_out = num_det;
if (keep_top_k > -1 && num_det > keep_top_k) {
std::vector<std::pair<T, std::pair<int, int>>> score_index_pairs;
for (const auto& it : *indices) {
int label = it.first;
const T* sdata = scores_data + label * predict_dim;
const std::vector<int>& label_indices = it.second;
for (size_t j = 0; j < label_indices.size(); ++j) {
int idx = label_indices[j];
// PADDLE_ENFORCE_LT(idx, predict_dim);
score_index_pairs.push_back(std::make_pair(sdata[idx], std::make_pair(label, idx)));
}
}
// Keep top k results per image.
std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
SortScorePairDescend<std::pair<int, int>>);
score_index_pairs.resize(keep_top_k);
// Store the new indices.
std::map<int, std::vector<int>> new_indices;
for (size_t j = 0; j < score_index_pairs.size(); ++j) {
int label = score_index_pairs[j].second.first;
int idx = score_index_pairs[j].second.second;
new_indices[label].push_back(idx);
}
new_indices.swap(*indices);
*num_nmsed_out = keep_top_k;
}
}
template <typename T>
void MultiClassOutput(const T *scores_data,
const std::vector<int> &score_dim,
const T *bboxes_data,
T *outputs_data,
const std::map<int, std::vector<int>>& selected_indices) {
int predict_dim = score_dim[1];
int count = 0;
for (const auto& it : selected_indices) {
/// one batch
int label = it.first;
const T* sdata = scores_data + label * predict_dim;
const std::vector<int>& indices = it.second;
for (size_t j = 0; j < indices.size(); ++j) {
int idx = indices[j];
const T* bdata = bboxes_data + idx * kBBoxSize;
outputs_data[count * kOutputDim] = label; // label
outputs_data[count * kOutputDim + 1] = sdata[idx]; // score
// xmin, ymin, xmax, ymax
std::memcpy(outputs_data + count * kOutputDim + 2, bdata, 4 * sizeof(T));
count++;
}
}
}
void MultiClassNMSCompute(NMSParam *param) {
assert(param->score_dim[0] == 1);
assert(param->box_dim[0] == 1);
assert (param->score_dim.size() == 3);
assert(param->box_dim.size() == 3);
float* outputs;
auto background_label = param->background_label;
auto nms_top_k = param->nmsTopK;
auto keep_top_k = param->keepTopK;
auto nms_threshold = param->nmsThreshold;
auto nms_eta = param->nmsEta;
auto score_threshold = param->scoreThredshold;
std::vector<int> score_dim_one_batch = {param->score_dim[1], param->score_dim[2]};
std::vector<int> box_dim_one_batch = {param->box_dim[1], param->box_dim[2]};
std::vector<int> batch_starts = {0};
std::map<int, std::vector<int>> indices;
int num_nmsed_out = 0;
MultiClassNMS<float>(param->box_data, box_dim_one_batch, param->score_data, score_dim_one_batch, &indices, &num_nmsed_out,
background_label, nms_top_k, keep_top_k, nms_threshold,
nms_eta, score_threshold);
batch_starts.push_back(batch_starts.back() + num_nmsed_out);
int output_size = 0;
int num_kept = batch_starts.back();
if (num_kept == 0) {
outputs = new float[1];
outputs[0] = -1;
output_size = 1;
} else {
outputs = new float[num_kept * kOutputDim];
int64_t s = batch_starts[0];
int64_t e = batch_starts[1];
if (e > s) {
MultiClassOutput<float>(param->score_data, score_dim_one_batch, param->box_data, outputs, indices);
}
output_size = num_kept * kOutputDim;
}
param->output = outputs;
param->output_size = output_size;
}
@implementation CPUResult
@end
@implementation NMSCompute
-(CPUResult *)computeWithScore:(float *)score andBBoxs:(float *)bbox {
NMSParam param;
param.box_data = bbox;
param.score_data = score;
param.background_label = self.background_label;
param.scoreThredshold = self.scoreThredshold;
param.nmsTopK = self.nmsTopK;
param.keepTopK = self.keepTopK;
param.nmsEta = self.nmsEta;
param.nmsThreshold = self.nmsThreshold;
std::vector<int> score_dim;
for (int i = 0; i < self.scoreDim.count; ++i) {
score_dim.push_back(self.scoreDim[i].intValue);
}
param.score_dim = score_dim;
std::vector<int> box_dim;
for (int i = 0; i < self.bboxDim.count; ++i) {
box_dim.push_back(self.bboxDim[i].intValue);
}
param.box_dim = box_dim;
MultiClassNMSCompute(&param);
CPUResult *cr = [[CPUResult alloc] init];
cr.output = param.output;
cr.outputSize = param.output_size;
return cr;
}
@end
......@@ -16,95 +16,110 @@ import Foundation
// Custom ?! operator: if the value before ?! is an optional, it is implicitly unwrapped; when it contains a value that value is returned, otherwise fatalError is called with the supplied message
precedencegroup ExecutedOrFatalError{
associativity: left
higherThan: AssignmentPrecedence
associativity: left
higherThan: AssignmentPrecedence
}
infix operator ?!: ExecutedOrFatalError
public func ?!<T>(option: T?, excuteOrError: @autoclosure () -> String) -> T{
if let inOpt = option {
return inOpt
}else{
print(excuteOrError())
fatalError(excuteOrError())
}
if let inOpt = option {
return inOpt
}else{
print(excuteOrError())
fatalError(excuteOrError())
}
}
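// A minimal usage sketch of the ?! operator above; the config dictionary and keys
// are illustrative only and not part of the framework.
func demoExecutedOrFatalError() {
    let config: [String: String] = ["modelPath": "genet/model"]
    // The key exists, so the optional is unwrapped and "genet/model" is returned.
    let modelPath = config["modelPath"] ?! "modelPath is missing from config"
    print(modelPath)
    // If the key were absent, the message would be printed and fatalError called:
    // let paramPath = config["paramPath"] ?! "paramPath is missing from config"
}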
//Lense
struct Lense<A, B> {
let from: (A) -> B
let to: (B, A) -> A
let from: (A) -> B
let to: (B, A) -> A
}
precedencegroup CombineLense{
associativity: left
higherThan: AssignmentPrecedence
associativity: left
higherThan: AssignmentPrecedence
}
infix operator >>>: CombineLense
func >>><A, B, C>(left: Lense<B, C>, right: Lense<A, B>) -> Lense<A, C> {
return Lense<A, C>.init(from: { (a) -> C in
left.from(right.from(a))
}, to: { (c, a) -> A in
right.to( left.to(c, right.from(a)),a)
})
return Lense<A, C>.init(from: { (a) -> C in
left.from(right.from(a))
}, to: { (c, a) -> A in
right.to( left.to(c, right.from(a)),a)
})
}
protocol CIntIndex {
associatedtype T;
subscript(index: CInt) -> T { get set};
associatedtype T;
subscript(index: CInt) -> T { get set};
}
extension Array: CIntIndex{
typealias T = Element
subscript(index: CInt) -> T {
get{
guard Int64(Int.max) >= Int64(index) else{
fatalError("cint index out of Int range")
}
return self[Int(index)]
}
set{
guard Int64(Int.max) >= Int64(index) else{
fatalError("cint index out of Int range")
}
self[Int(index)] = newValue
}
typealias T = Element
subscript(index: CInt) -> T {
get{
guard Int64(Int.max) >= Int64(index) else{
fatalError("cint index out of Int range")
}
return self[Int(index)]
}
set{
guard Int64(Int.max) >= Int64(index) else{
fatalError("cint index out of Int range")
}
self[Int(index)] = newValue
}
}
}
extension Array where Element: AnyObject{
mutating func remove(element: Element) {
if let index = index(where: { (node) -> Bool in
return unsafeBitCast(element, to: Int.self) == unsafeBitCast(node, to: Int.self)
}) {
remove(at: index)
}
mutating func remove(element: Element) {
if let index = index(where: { (node) -> Bool in
return unsafeBitCast(element, to: Int.self) == unsafeBitCast(node, to: Int.self)
}) {
remove(at: index)
}
}
}
//MARK: Array extension
extension Array where Element: Comparable{
/// Returns the top r elements of the array, with each element's position in the original array as the first member of the tuple
///
/// - Parameter r: number of top elements to return
/// - Returns: [(original index, element in descending order)]
public func top(r: Int) -> [(Int, Element)] {
precondition(r <= self.count)
return Array<(Int, Element)>(zip(0..<self.count, self).sorted{ $0.1 > $1.1 }.prefix(through: r - 1))
/// Returns the top r elements of the array, with each element's position in the original array as the first member of the tuple
///
/// - Parameter r: number of top elements to return
/// - Returns: [(original index, element in descending order)]
public func top(r: Int) -> [(Int, Element)] {
precondition(r <= self.count)
return Array<(Int, Element)>(zip(0..<self.count, self).sorted{ $0.1 > $1.1 }.prefix(through: r - 1))
}
}
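// A minimal usage sketch of top(r:); the score values below are illustrative only.
func demoTop() {
    let scores: [Float] = [0.1, 0.9, 0.4, 0.7]
    // Returns the two largest elements with their original indices,
    // sorted by value in descending order: [(1, 0.9), (3, 0.7)].
    let top2 = scores.top(r: 2)
    print(top2)
}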
extension Array {
public func strideArray(inCount: Int = 20) -> [(Int, Element)] {
if count < inCount {
return (0..<count).map{ ($0, self[$0]) }
} else {
let stride = count / inCount
var newArray: [(Int, Element)] = []
for i in 0..<inCount {
newArray.append((i * stride, self[i * stride]))
}
return newArray
}
}
}
extension String{
func cStr() -> UnsafePointer<Int8>? {
return (self as NSString).utf8String
}
func cStr() -> UnsafePointer<Int8>? {
return (self as NSString).utf8String
}
}
func address<T: AnyObject>(o: T) -> String {
return String.init(format: "%018p", unsafeBitCast(o, to: Int.self))
return String.init(format: "%018p", unsafeBitCast(o, to: Int.self))
}
......
......@@ -18,263 +18,588 @@ fileprivate var defaultMetalLibrary: MTLLibrary?
fileprivate var paddleMobileMetalLibrary: MTLLibrary?
extension MTLDevice {
func defaultLibrary() -> MTLLibrary {
if defaultMetalLibrary == nil {
defaultMetalLibrary = makeDefaultLibrary()
}
if let inDefaultLib = defaultMetalLibrary {
return inDefaultLib
} else {
fatalError(" default metal libary is nil")
}
func defaultLibrary() -> MTLLibrary {
if defaultMetalLibrary == nil {
defaultMetalLibrary = makeDefaultLibrary()
}
if let inDefaultLib = defaultMetalLibrary {
return inDefaultLib
} else {
fatalError(" default metal libary is nil")
}
}
func paddleMobileLibrary() -> MTLLibrary {
if paddleMobileMetalLibrary == nil {
guard let path = Bundle.init(for: Kernel.self).path(forResource: "default", ofType: "metallib") else {
fatalError("Counld't find paddle mobile library")
}
do {
paddleMobileMetalLibrary = try makeLibrary(filepath: path)
} catch _ {
fatalError("Counld't load paddle mobile library")
}
}
func paddleMobileLibrary() -> MTLLibrary {
if paddleMobileMetalLibrary == nil {
guard let path = Bundle.init(for: Kernel.self).path(forResource: "default", ofType: "metallib") else {
fatalError("Counld't find paddle mobile library")
}
do {
paddleMobileMetalLibrary = try makeLibrary(filepath: path)
} catch _ {
fatalError("Counld't load paddle mobile library")
}
}
if let inPaddleMobileLib = paddleMobileMetalLibrary {
return inPaddleMobileLib
} else {
fatalError("PaddleMobile metal libary is nil")
}
if let inPaddleMobileLib = paddleMobileMetalLibrary {
return inPaddleMobileLib
} else {
fatalError("PaddleMobile metal libary is nil")
}
}
func pipeLine(funcName: String, inPaddleMobileLib: Bool = true) -> MTLComputePipelineState {
let useLib = inPaddleMobileLib ? paddleMobileLibrary() : defaultLibrary()
guard let function = useLib.makeFunction(name: funcName) else {
fatalError(" function " + funcName + " not found")
}
do {
let pipLine = try makeComputePipelineState(function: function)
return pipLine
} catch let error {
print(error)
fatalError("make pip line error occured : \(error)")
}
func pipeLine(funcName: String, inPaddleMobileLib: Bool = true) -> MTLComputePipelineState {
let useLib = inPaddleMobileLib ? paddleMobileLibrary() : defaultLibrary()
guard let function = useLib.makeFunction(name: funcName) else {
fatalError(" function " + funcName + " not found")
}
do {
let pipLine = try makeComputePipelineState(function: function)
return pipLine
} catch _ {
fatalError("make pip line error occured")
}
}
func makeBuffer<P>(value: [P]) -> MTLBuffer {
let buffer = makeBuffer(length: value.count * MemoryLayout<P>.size, options: MTLResourceOptions.storageModeShared)
let contents = buffer?.contents().bindMemory(to: P.self, capacity: value.count * MemoryLayout<P>.size)
for i in 0..<value.count {
contents?[i] = value[i]
}
return buffer!
}
func texture2tensor_loop<P>(texture: MTLTexture, cb: ([Int], P)->Void) -> Void {
let bpR = texture.width * 4 * MemoryLayout<P>.size
let bpI = texture.height * bpR
let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: 1))
for i in 0..<texture.arrayLength {
let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: bpI)
texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i)
for tx in 0..<texture.width * texture.height * 4 {
var k = tx
var xyzn: [Int] = [0, 0, 0, 0]
xyzn[1] = k / (texture.width * 4)
k %= (texture.width * 4)
xyzn[3] = k % 4
xyzn[0] = k / 4
xyzn[2] = i
cb(xyzn, pointer[tx])
}
}
}
func texture2tensor_3<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
var tdim: [Int] = [1, 1, 1, 1]
for i in 0..<dim.count {
tdim[4 - dim.count + i] = dim[i]
}
let count = dim.reduce(1) { $0 * $1 }
var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
let ndim: [Int] = transpose.map { tdim[$0] }
assert(dim.count == 3)
assert(texture.width == ndim[3])
assert(texture.height == ndim[2])
assert(ndim[0] == 1)
assert(texture.arrayLength == (ndim[1] + 3) / 4)
texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
var tg: [Int] = [0, 0, 0, 0]
tg[1] = xyzn[2] * 4 + xyzn[3]
tg[2] = xyzn[1]
tg[3] = xyzn[0]
var ig: [Int] = [0, 0, 0, 0]
for k in 0..<4 {
ig[transpose[k]] = tg[k]
}
let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
if ix < count {
tensor[ix] = v
}
}
return tensor
}
func texture2tensor_2<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
var tdim: [Int] = [1, 1, 1, 1]
for i in 0..<dim.count {
tdim[4 - dim.count + i] = dim[i]
}
let count = dim.reduce(1) { $0 * $1 }
var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
let ndim: [Int] = transpose.map { tdim[$0] }
assert(dim.count == 2)
let w = (ndim[3] + 3) / 4
assert(texture.width == w)
assert(texture.height == ndim[2])
assert(ndim[0] == 1)
assert(ndim[1] == 1)
assert(texture.arrayLength == 1)
func makeBuffer<P>(value: [P]) -> MTLBuffer {
let buffer = makeBuffer(length: value.count * MemoryLayout<P>.size, options: MTLResourceOptions.storageModeShared)
let contents = buffer?.contents().bindMemory(to: P.self, capacity: value.count * MemoryLayout<P>.size)
for i in 0..<value.count {
contents?[i] = value[i]
}
return buffer!
texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
var tg: [Int] = [0, 0, 0, 0]
tg[2] = xyzn[1]
tg[3] = xyzn[0] * 4 + xyzn[3]
var ig: [Int] = [0, 0, 0, 0]
for k in 0..<4 {
ig[transpose[k]] = tg[k]
}
let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
if ix < count {
tensor[ix] = v
}
}
return tensor
}
func texture2tensor_1<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
var tdim: [Int] = [1, 1, 1, 1]
for i in 0..<dim.count {
tdim[4 - dim.count + i] = dim[i]
}
let count = dim.reduce(1) { $0 * $1 }
var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
let ndim: [Int] = transpose.map { tdim[$0] }
assert(dim.count == 1)
let w = (ndim[3] + 3) / 4
assert(texture.width == w)
assert(texture.height == 1)
assert(ndim[0] == 1)
assert(ndim[1] == 1)
assert(ndim[2] == 1)
assert(texture.arrayLength == 1)
func makeFloatTexture<P>(value: [P], textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture{
let textureDesc = MTLTextureDescriptor.init()
textureDesc.width = textureWidth
textureDesc.height = textureHeight
textureDesc.depth = 1
textureDesc.usage = [.shaderRead, .shaderWrite]
textureDesc.pixelFormat = .rgba32Float
textureDesc.textureType = .type2DArray
textureDesc.storageMode = .shared
textureDesc.cpuCacheMode = .defaultCache
textureDesc.arrayLength = arrayLength
let texture = makeTexture(descriptor: textureDesc)!
if arrayLength == 1 && value.count >= 4{
let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: value.count * MemoryLayout<P>.size)
for i in 0..<value.count {
pointer[i] = value[i]
texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
var tg: [Int] = [0, 0, 0, 0]
tg[3] = xyzn[0] * 4 + xyzn[3]
var ig: [Int] = [0, 0, 0, 0]
for k in 0..<4 {
ig[transpose[k]] = tg[k]
}
let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
if ix < count {
tensor[ix] = v
}
}
return tensor
}
func texture2tensor<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
if dim.count == 3 {
return texture2tensor_3(texture: texture, dim: dim, transpose: transpose)
} else if dim.count == 2 {
return texture2tensor_2(texture: texture, dim: dim, transpose: transpose)
} else if dim.count == 1 {
return texture2tensor_1(texture: texture, dim: dim, transpose: transpose)
}
var tdim: [Int] = [1, 1, 1, 1]
for i in 0..<dim.count {
tdim[4 - dim.count + i] = dim[i]
}
let count = dim.reduce(1) { $0 * $1 }
var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
let ndim: [Int] = transpose.map { tdim[$0] }
assert(texture.width == ndim[2])
assert(texture.height == ndim[1])
assert(texture.arrayLength == (ndim[0] * ndim[3] + 3) / 4)
texture2tensor_loop(texture: texture) { (xyzn: [Int], v: P) in
var tg: [Int] = [0, 0, 0, 0]
tg[1] = xyzn[1]
tg[2] = xyzn[0]
tg[0] = (xyzn[2] * 4 + xyzn[3]) / ndim[3]
tg[3] = (xyzn[2] * 4 + xyzn[3]) % ndim[3]
var ig: [Int] = [0, 0, 0, 0]
for k in 0..<4 {
ig[transpose[k]] = tg[k]
}
let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
if ix < count {
tensor[ix] = v
}
}
return tensor
}
func tensor2texture<P>(value: [P], dim: [Int], transpose: [Int] = [0, 1, 2, 3], inComputePrecision: ComputePrecision = .Float32) -> MTLTexture {
if value.count > 0 {
assert(value.count == dim.reduce(1) { $0 * $1 })
}
var tdim: [Int] = [1, 1, 1, 1]
for i in 0..<dim.count {
tdim[4 - dim.count + i] = dim[i]
}
let ndim: [Int] = transpose.map { tdim[$0] }
let textureDesc = MTLTextureDescriptor.init()
textureDesc.width = ndim[2]
textureDesc.height = ndim[1]
textureDesc.depth = 1
textureDesc.usage = [.shaderRead, .shaderWrite]
if inComputePrecision == .Float16 {
textureDesc.pixelFormat = .rgba16Float
} else if inComputePrecision == .Float32 {
textureDesc.pixelFormat = .rgba32Float
}
textureDesc.textureType = .type2DArray
textureDesc.storageMode = .shared
textureDesc.cpuCacheMode = .defaultCache
textureDesc.arrayLength = (ndim[0] * ndim[3] + 3) / 4
let texture = makeTexture(descriptor: textureDesc)!
if value.count > 0 {
var rcount: Int = (ndim[0] * ndim[3] + 3) / 4
rcount = rcount * 4 * ndim[1] * ndim[2]
var nvalue: [Float32] = .init(repeating: 0.0, count: rcount)
for i0 in 0..<tdim[0] {
for i1 in 0..<tdim[1] {
for i2 in 0..<tdim[2] {
for i3 in 0..<tdim[3] {
let ig = [i0, i1, i2, i3]
let ix = (i0 * tdim[1] * tdim[2] * tdim[3]) + (i1 * tdim[2] * tdim[3]) + (i2 * tdim[3]) + i3
let jg = transpose.map { ig[$0] }
let k = jg[0] * ndim[3] + jg[3]
let jx = ((k / 4) * ndim[1] * ndim[2] * 4) + (jg[1] * ndim[2] * 4) + (jg[2] * 4) + (k % 4)
nvalue[jx] = value[ix] as! Float32
}
let bytesPerRow = texture.width * texture.depth * 4 * MemoryLayout<P>.size
let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: texture.depth))
texture.replace(region: region, mipmapLevel: 0, withBytes: pointer, bytesPerRow: bytesPerRow)
} else {
}
}
return texture
}
let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: ndim[2], height: ndim[1], depth: 1))
if inComputePrecision == .Float16 {
let xvalue: [UInt16] = .init(repeating: 0, count: rcount)
let pointer: UnsafeMutablePointer<Float32> = UnsafeMutablePointer(mutating: nvalue)
let outputP: UnsafeMutablePointer<UInt16> = UnsafeMutablePointer(mutating: xvalue)
float32ToFloat16(input: pointer, output: outputP, count: rcount)
let bpR = ndim[2] * 4 * 2
let bpI = ndim[1] * bpR
for i in 0..<textureDesc.arrayLength {
let p = outputP + texture.width * texture.height * 4 * i
texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: p, bytesPerRow: bpR, bytesPerImage: bpI)
}
} else {
let pointer: UnsafeMutablePointer<Float32> = UnsafeMutablePointer(mutating: nvalue)
let bpR = ndim[2] * 4 * MemoryLayout<P>.size
let bpI = ndim[1] * bpR
for i in 0..<textureDesc.arrayLength {
let p = pointer + texture.width * texture.height * 4 * i
texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: p, bytesPerRow: bpR, bytesPerImage: bpI)
}
}
}
return texture
}
func makeFloatTexture<P>(value: [P], textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture{
let textureDesc = MTLTextureDescriptor.init()
textureDesc.width = textureWidth
textureDesc.height = textureHeight
textureDesc.depth = 1
textureDesc.usage = [.shaderRead, .shaderWrite]
textureDesc.pixelFormat = .rgba32Float
textureDesc.textureType = .type2DArray
textureDesc.storageMode = .shared
textureDesc.cpuCacheMode = .defaultCache
textureDesc.arrayLength = arrayLength
let texture = makeTexture(descriptor: textureDesc)!
if value.count >= 4{
let counts = arrayLength * 4 * textureWidth * textureHeight
let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: counts * MemoryLayout<P>.size)
for i in 0..<value.count {
pointer[i] = value[i]
}
for i in value.count..<counts {
pointer[i] = 0 as! P
}
let bytesPerRow = texture.width * texture.depth * 4 * MemoryLayout<P>.size
let bytesPerImage = texture.height * bytesPerRow
let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: texture.depth))
for i in 0..<arrayLength {
let p = pointer + texture.width * texture.height * 4 * i
texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: p, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage)
}
} else {
}
return texture
}
}
extension MTLComputeCommandEncoder {
func dispatch(computePipline: MTLComputePipelineState, outTexture: MTLTexture) {
let slices = (outTexture.arrayLength * 4 + 3)/4
let width = computePipline.threadExecutionWidth
let height = computePipline.maxTotalThreadsPerThreadgroup/width
let threadsPerGroup = MTLSize.init(width: width, height: height, depth: 1)
// print(" thread: threads per group: \(threadsPerGroup) ")
// print(" thread: out texture width: \(outTexture.width) , out texture height: \(outTexture.height)")
let groupWidth = (outTexture.width + width - 1)/width
let groupHeight = (outTexture.height + height - 1)/height
let groupDepth = slices
let groups = MTLSize.init(width: groupWidth, height: groupHeight, depth: groupDepth)
// print("groups: \(groups) ")
// print("threads per group: \(threadsPerGroup)")
setComputePipelineState(computePipline)
dispatchThreadgroups(groups, threadsPerThreadgroup: threadsPerGroup)
}
public func dispatch(computePipline: MTLComputePipelineState, outTexture: MTLTexture) {
let slices = (outTexture.arrayLength * 4 + 3)/4
let width = computePipline.threadExecutionWidth
let height = computePipline.maxTotalThreadsPerThreadgroup/width
let threadsPerGroup = MTLSize.init(width: width, height: height, depth: 1)
// print(" thread: threads per group: \(threadsPerGroup) ")
// print(" thread: out texture width: \(outTexture.width) , out texture height: \(outTexture.height)")
let groupWidth = (outTexture.width + width - 1)/width
let groupHeight = (outTexture.height + height - 1)/height
let groupDepth = slices
let groups = MTLSize.init(width: groupWidth, height: groupHeight, depth: groupDepth)
setComputePipelineState(computePipline)
dispatchThreadgroups(groups, threadsPerThreadgroup: threadsPerGroup)
}
}
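// Worked example of the threadgroup math in dispatch(computePipline:outTexture:) above.
// The device-dependent numbers are assumptions for illustration only:
//   threadExecutionWidth = 32, maxTotalThreadsPerThreadgroup = 512
//     -> threadsPerGroup = (32, 512 / 32, 1) = (32, 16, 1)
//   outTexture: width = 28, height = 28, arrayLength = 2
//     -> groups = ((28 + 31) / 32, (28 + 15) / 16, (2 * 4 + 3) / 4) = (1, 2, 2)
// The ceil-division guarantees every pixel of every array slice is covered, at the
// cost of some idle threads along the right and bottom edges.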
public extension MTLTexture {
func stridableFloatArray<P>(stridable: Bool = true) -> [(index: Int, value: P)] {
var arr: [P] = floatArray { (p: P) -> P in
return p;
func stridableFloatArray<P>(stridable: Bool = true) -> [(index: Int, value: P)] {
var arr: [P] = floatArray { (p: P) -> P in
return p;
}
var result: [(index: Int, value: P)] = []
if arr.count > 100 && stridable {
for j in stride(from: 0, to: arr.count , by: arr.count / 100){
result.append((j, arr[j]))
}
} else {
for j in 0..<arr.count {
result.append((j, arr[j]))
}
}
return result
}
func floatArray<P, T>(res: (P) -> T) -> [T] {
var fArr: [T] = []
if textureType == .type2DArray {
for i in 0..<arrayLength{
let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<P>.size, alignment: MemoryLayout<P>.alignment)
let bytesPerRow = width * depth * 4 * MemoryLayout<P>.size
let bytesPerImage = width * height * depth * 4 * MemoryLayout<P>.size
let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i)
let p = bytes.assumingMemoryBound(to: P.self)
for j in 0..<width * height * depth * 4 {
fArr.append(res(p[j]))
}
var result: [(index: Int, value: P)] = []
if arr.count > 100 && stridable {
for j in stride(from: 0, to: arr.count , by: arr.count / 100){
result.append((j, arr[j]))
}
bytes.deallocate()
}
} else if textureType == .type2D {
let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<P>.size, alignment: MemoryLayout<P>.alignment)
let bytesPerRow = width * depth * 4 * MemoryLayout<P>.size
let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0)
let p = bytes.assumingMemoryBound(to: P.self)
for j in 0..<width * height * 4 {
fArr.append(res(p[j]))
}
bytes.deallocate()
}
return fArr
}
func float32Array() -> [Float32] {
if pixelFormat == .rgba32Float {
let float32Array = floatArray { (f: Float32) -> Float32 in
return f
}
return float32Array
} else if pixelFormat == .rgba16Float {
var float16Array = floatArray { (f: Float16) -> Float16 in
return f
}
return float16To32(input: &float16Array, count: float16Array.count)
} else {
fatalError()
}
}
func logDesc<T>(header: String = "", stridable: Bool = true) -> T? {
print(header)
print("texture: \(self)")
// let res: [(index: Int, value: T)] = stridableFloatArray(stridable: stridable)
// print(res)
if textureType == .type2DArray {
for i in 0..<arrayLength{
var str: String = "slice: \(i): \n"
let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<T>.size, alignment: MemoryLayout<T>.alignment)
let bytesPerRow = width * depth * 4 * MemoryLayout<T>.size
let bytesPerImage = width * height * depth * 4 * MemoryLayout<T>.size
let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i)
let p = bytes.assumingMemoryBound(to: T.self)
str += "2d array count : \(width * height * depth * 4) \n"
if stridable && width * height * depth * 4 > 20 {
for j in stride(from: 0, to: width * height * depth * 4 , by: width * height * depth * 4 / 20){
str += " index \(j): \(p[j])"
}
} else {
for j in 0..<arr.count {
result.append((j, arr[j]))
for j in 0..<width * height * depth * 4 {
str += " index \(j): \(p[j])"
}
}
bytes.deallocate()
print(str)
}
} else if textureType == .type2D {
var str: String = "texture 2D: "
let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<T>.size, alignment: MemoryLayout<T>.alignment)
let bytesPerRow = width * depth * 4 * MemoryLayout<T>.size
let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0)
let p = bytes.assumingMemoryBound(to: T.self)
str += "2d count : \(width * width * 4) \n"
if stridable {
for j in stride(from: 0, to: width * height * 4, by: width * height * 4 / 20){
str += "index \(j): \(p[j]) "
}
} else {
for j in 0..<width * height * 4 {
str += "index \(j): \(p[j]) "
}
}
print(str)
bytes.deallocate()
}
return nil
}
// n c h w - dim
func toTensor(dim: (n: Int, c: Int, h: Int, w: Int)) -> [Float32] {
var textureArray: [Float32]
if pixelFormat == .rgba32Float {
textureArray = floatArray { (i : Float32) -> Float32 in
return i
}
} else if pixelFormat == .rgba16Float {
var textureFloat16Array = floatArray { (i : Float16) -> Float16 in
return i
}
textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count)
} else {
fatalError(" 目前还不支持其他类型 ")
}
var output: [Float32] = []
for s in 0..<arrayLength {
for c in 0..<4{
for h in 0..<dim.h {
for w in 0..<dim.w {
if (s * 4 + c) < dim.c {
let textureValue = textureArray[dim.w * dim.h * 4 * s + h * dim.w * 4 + w * 4 + c]
output.append(textureValue)
}
}
}
}
}
return output
}
func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int)) -> [Float32] {
// print("origin dim: \(dim)")
// print("texture: ")
// print(self)
var textureArray: [Float32]
if pixelFormat == .rgba32Float {
textureArray = floatArray { (i : Float32) -> Float32 in
return i
}
} else if pixelFormat == .rgba16Float {
var textureFloat16Array = floatArray { (i : Float16) -> Float16 in
return i
}
textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count)
} else {
fatalError(" 目前还不支持其他类型 ")
}
var output: [Float32] = []
let numOfASlice = dim.h * dim.w * 4
for h in 0..<dim.h {
for w in 0..<dim.w {
for sliceIndex in 0..<arrayLength {
if sliceIndex * 4 + 4 > dim.c {
for i in 0..<(4 - ((sliceIndex * 4 + 4) - dim.c)) {
let value = textureArray[sliceIndex * numOfASlice + h * dim.w * 4 + w * 4 + i]
output.append(value)
}
} else {
for i in 0..<4 {
let value = textureArray[sliceIndex * numOfASlice + h * dim.w * 4 + w * 4 + i]
output.append(value)
}
}
}
}
}
return output
}
}
public extension MTLBuffer {
func logDesc<T>(header: String = "", stridable: Bool = true) -> T? {
print(header)
print("MTLBuffer: \(self) ")
var str = ""
if stridable && length/MemoryLayout<T>.stride > 1000{
for j in stride(from: 0, to: length, by: length/MemoryLayout<T>.stride / 100){
str += " \(contents().assumingMemoryBound(to: T.self)[j])"
}
} else {
for i in 0..<length/MemoryLayout<T>.size {
str += " \(contents().assumingMemoryBound(to: T.self)[i])"
}
}
print(str)
return nil
}
func makeTexture(textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture {
let textureDesc = MTLTextureDescriptor.init()
textureDesc.width = textureWidth
textureDesc.height = textureHeight
textureDesc.depth = 1
textureDesc.usage = [.shaderRead, .shaderWrite]
textureDesc.pixelFormat = .rgba32Float
textureDesc.textureType = .type2DArray
textureDesc.storageMode = .shared
textureDesc.cpuCacheMode = .defaultCache
textureDesc.arrayLength = arrayLength
let texture = makeTexture(descriptor: textureDesc, offset: 0, bytesPerRow: textureWidth * 4 * 4)!
return texture
}
func array<T>() -> [T] {
var array: [T] = []
let pointer = contents().bindMemory(to: T.self, capacity: length)
for i in 0..<(length / MemoryLayout<T>.size) {
array.append(pointer[i])
}
return array;
}
}
//
// TestConvAddBatchNormRelu.swift
// paddle-mobile-demo
//
// Created by liuRuiLong on 2018/7/25.
// Copyright © 2018 orange. All rights reserved.
//
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Metal
import Foundation
......@@ -17,6 +23,204 @@ public class PaddleMobileUnitTest {
queue = inQueue
}
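// indentPrintTensor recursively pretty-prints a flat tensor as nested, indented brackets, peeling
// off one dimension per indentation level; tensorPrint below first resolves at most one -1 entry
// in `dim` from the element count and then starts the recursion with a zeroed index vector.
// Hedged usage sketch (not part of the original test file):
// tensorPrint(tensor: (0..<6).map { Float32($0) }, dim: [2, -1]) // prints 2 rows of 3 values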
private func indentPrintTensor(tensor: [Float32], dim: [Int], ix: [Int], indentLevel: Int) {
let indent = Array.init(repeating: " ", count: indentLevel).joined(separator: "")
var tx = ix
if dim.count == indentLevel + 1 {
var log: String = indent + "["
for i in 0..<dim[indentLevel] {
tx = ix
tx[indentLevel] = i
for x in 1..<dim.count {
for y in 0..<x {
tx[y] *= dim[x]
}
}
let c = tx.reduce(0) { $0 + $1 }
if i > 0 {
log += ", "
}
log += tensor[c].description
}
log += "]"
if (indentLevel > 0) && (ix[indentLevel - 1] < dim[indentLevel - 1] - 1) {
log += ","
}
print(log)
} else {
print(indent + "[")
for i in 0..<dim[indentLevel] {
tx[indentLevel] = i
indentPrintTensor(tensor: tensor, dim: dim, ix: tx, indentLevel: indentLevel + 1)
}
if (indentLevel > 0) && (ix[indentLevel - 1] < dim[indentLevel - 1] - 1) {
print(indent + "],")
} else {
print(indent + "]")
}
}
}
private func tensorPrint(tensor: [Float32], dim: [Int]) {
var detectPos = -1
var odim = 1
var ndim = dim
for i in 0..<dim.count {
if dim[i] == -1 {
if detectPos == -1 {
detectPos = i
} else {
detectPos = -2
}
} else if dim[i] <= 0 {
detectPos = -3
} else {
odim *= dim[i]
}
}
assert(detectPos >= -1)
if (detectPos == -1) {
assert(tensor.count == odim)
} else {
assert(tensor.count % odim == 0)
ndim[detectPos] = tensor.count / odim
}
indentPrintTensor(tensor: tensor, dim: ndim, ix: dim.map { $0 * 0 }, indentLevel: 0)
}
public func testConcat() {
// let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
// var it: [[Float32]] = []
// for _ in 0..<7 {
// it.append((0..<12).map { Float32($0) })
// }
// let input = it.map { device.tensor2texture(value: $0, dim: [3, 4]) }
// let output = device.tensor2texture(value: [Float32](), dim: [3, 28])
//
// let param = ConcatTestParam.init(
// input: input,
// output: output,
// dims: [[3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]],
// axis: 1,
// odim: [3, 28]
// )
// let concatKernel = ConcatKernel<Float32>.init(device: device, testParam: param)
// concatKernel.test(cmdBuffer: buffer, param: param)
// buffer.addCompletedHandler { (buffer) in
// for i in 0..<it.count {
// let _: Float32? = input[i].logDesc()
// self.tensorPrint(tensor: it[i], dim: [3, 4])
// }
// let _: Float32? = output.logDesc()
// let tx: [Float32] = self.device.texture2tensor(texture: output, dim: [3, 28])
// self.tensorPrint(tensor: tx, dim: [3, 28])
// }
//
// buffer.commit()
}
public func testReshape() {
// let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
// let input: [Float32] = (0..<24).map { Float32($0) }
// let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
// let outTexture = device.tensor2texture(value: [Float32](), dim: [4, 6])
// let mp = ReshapeMetalParam.init(
// idim: (1, 2, 3, 4),
// itrans: (0, 1, 2, 3),
// odim: (1, 1, 4, 6),
// otrans: (0, 1, 2, 3)
// )
// let param = ReshapeTestParam.init(
// inputTexture: inTexture,
// outputTexture: outTexture,
// param: mp
// )
// let reshapeKernel = ReshapeKernel<Float32>.init(device: device, testParam: param)
// reshapeKernel.test(commandBuffer: buffer, testParam: param)
// buffer.addCompletedHandler { (buffer) in
// let _: Float32? = inTexture.logDesc()
// let _: Float32? = outTexture.logDesc()
// self.tensorPrint(tensor: input, dim: [2, 3, 4])
// let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [4, 6])
// self.tensorPrint(tensor: tx, dim: [4, 6])
// }
// let input: [Float32] = (0..<24).map { Float32($0) }
// let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
// let outTexture = device.tensor2texture(value: [Float32](), dim: [24])
// let mp = ReshapeMetalParam.init(
// idim: (1, 2, 3, 4),
// itrans: (0, 1, 2, 3),
// odim: (1, 1, 1, 24),
// otrans: (0, 1, 2, 3)
// )
// let param = ReshapeTestParam.init(
// inputTexture: inTexture,
// outputTexture: outTexture,
// param: mp
// )
// let reshapeKernel = ReshapeKernel<Float32>.init(device: device, testParam: param)
// reshapeKernel.test(commandBuffer: buffer, testParam: param)
// buffer.addCompletedHandler { (buffer) in
// let _: Float32? = inTexture.logDesc()
// let _: Float32? = outTexture.logDesc()
// self.tensorPrint(tensor: input, dim: [2, 3, 4])
// let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [24])
// self.tensorPrint(tensor: tx, dim: [24])
// }
//
//
// buffer.commit()
}
public func testTranspose() {
let buffer = queue.makeCommandBuffer() ?! "buffer is nil"
// var input: [Float32] = []
// for i in 0..<72 {
// input.append(Float32(i))
// }
//// let inputTexture = device.makeFloatTexture(value: input, textureWidth: 3, textureHeight: 2, arrayLength: 3)
// let inputTexture = device.tensor2texture(value: input, dim: [4, 3, 2, 3]);
// // group 1
// let outputTexture = device.tensor2texture(value: [Float32](), dim: [3, 3, 2, 4])
// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 3, oC: 4, axis: [3, 1, 2, 0])
//// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [3, 0, 2, 1])
//// // group 2
//// let outputTexture = device.makeFloatTexture(value: [Float32](), textureWidth: 3, textureHeight: 3, arrayLength: 6)
//// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 4, axis: [3, 0, 2, 1])
////
// let transposeKernel = TransposeKernel<Float32>.init(device: device, testParam: param)
//
// transposeKernel.test(commandBuffer: buffer, param: param)
//
// buffer.addCompletedHandler { (buffer) in
// let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false)
// let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false)
// self.tensorPrint(tensor: input, dim: [4, 3, 2, 3])
// let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 3, 2, 4])
// self.tensorPrint(tensor: tx, dim: [3, 3, 2, 4])
// }
//
// let input: [Float32] = (0..<24).map { Float32($0) }
// let inputTexture = device.tensor2texture(value: input, dim: [2, 3, 4])
// let outputTexture = device.tensor2texture(value: [Float](), dim: [3, 4, 2])
// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [0, 2, 3, 1])
// let transposeKernel = TransposeKernel<Float32>.init(device: device, testParam: param)
//
// transposeKernel.test(commandBuffer: buffer, param: param)
//
// buffer.addCompletedHandler { (buffer) in
// let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false)
// let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false)
// self.tensorPrint(tensor: input, dim: [2, 3, 4])
// let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 4, 2])
// self.tensorPrint(tensor: tx, dim: [3, 4, 2])
// }
//
buffer.commit()
}
public func testConvAddBnRelu() {
let buffer = queue.makeCommandBuffer() ?! " buffer is nil "
......@@ -116,7 +320,7 @@ public class PaddleMobileUnitTest {
let offsetX = filterSize.width/2 - paddings.0
let offsetY = filterSize.height/2 - paddings.1
let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1))
let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize)
......@@ -132,16 +336,6 @@ public class PaddleMobileUnitTest {
}
buffer.commit()
// let inputTexture = device.makeFloatTexture(value: <#T##[P]#>, textureWidth: <#T##Int#>, textureHeight: <#T##Int#>, arrayLength: <#T##Int#>)
// let param = ConvAddBatchNormReluTestParam.init(inInputTexture: <#T##MTLTexture#>, inOutputTexture: <#T##MTLTexture#>, inMetalParam: <#T##MetalConvParam#>, inFilterBuffer: <#T##MTLBuffer#>, inBiaseBuffer: <#T##MTLBuffer#>, inNewScaleBuffer: <#T##MTLBuffer#>, inNewBiaseBuffer: <#T##MTLBuffer#>, inFilterSize: <#T##(width: Int, height: Int, channel: Int)#>)
// ConvAddBatchNormReluKernel.init(device: <#T##MTLDevice#>, testParam: <#T##ConvAddBatchNormReluTestParam#>)
}
}
......
//
// Tools.swift
// paddle-mobile
//
// Created by liuRuiLong on 2018/7/26.
// Copyright © 2018 orange. All rights reserved.
//
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
......
......@@ -13,80 +13,228 @@
limitations under the License. */
import Foundation
import Accelerate
public protocol SummableMultipliable: Equatable {
static func +(lhs: Self, rhs: Self) -> Self
static func *(lhs: Self, rhs: Self) -> Self
static func -(lhs: Self, rhs: Self) -> Self
}
public protocol PrecisionType: SummableMultipliable{
init(inFloat: Float32)
init(inFloat16: Float16)
init<P: PrecisionType>(_ inP: P)
static var bitSize: UInt { get }
}
public typealias Float16 = Int16
extension Float16: PrecisionType {
public static func * (prefix: Float16, postfix: Float16) -> Float16 {
return prefix * postfix
}
public init<P>(_ inP: P) where P : PrecisionType {
if P.bitSize == Float32.bitSize {
self = Float16(inFloat: inP as! Float32)
} else if P.bitSize == Float16.bitSize {
self = inP as! Float16
} else {
fatalError()
}
}
public static var bitSize: UInt {
return 16
}
public init(inFloat16: Float16) {
self = inFloat16
}
public init(inFloat: Float32) {
self = Int16(inFloat)
}
}
extension Float32: PrecisionType {
public init<P>(_ inP: P) where P : PrecisionType {
if P.bitSize == Float32.bitSize {
self = inP as! Float32
} else if P.bitSize == Float16.bitSize {
self = Float32.init(inP as! Float16)
} else {
fatalError()
}
}
public init(inFloat: Float32) {
self = inFloat
}
public init(inFloat16: Float16) {
self = Float32.init(inFloat16)
}
public static var bitSize: UInt {
return 32
}
}
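// The helpers below bridge between Float32 and the 16-bit storage type aliased as Float16 above,
// using Accelerate's planar vImage conversions. A minimal, hypothetical usage sketch:
// var halves = [Float16](repeating: 0, count: 4)
// var singles: [Float32] = [0.5, 1.0, 1.5, 2.0]
// float32ToFloat16(input: &singles, output: &halves, count: 4)
// let roundTripped = float16To32(input: &halves, count: 4) // ≈ [0.5, 1.0, 1.5, 2.0]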
public func float32ToFloat16(input: UnsafeMutablePointer<Float32>, output: UnsafeMutableRawPointer, count: Int) {
var float32Buffer = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4)
var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2)
guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else {
fatalError(" float 32 to float 16 error ! ")
}
}
public func float16To32(input: UnsafeMutablePointer<Float16>, count: Int) -> [Float32] {
var output = Array<Float>.init(repeating: 0.0, count: count)
float16to32(input: input, output: &output, count: count)
return output
}
public func float16to32(input: UnsafeMutablePointer<Float16>, output: UnsafeMutablePointer<Float32>, count: Int) {
var bufferFloat16 = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 2)
var bufferFloat32 = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 4)
if vImageConvert_Planar16FtoPlanarF(&bufferFloat16, &bufferFloat32, 0) != kvImageNoError {
fatalError(" convert float16 to float32 error")
}
}
// N - 0 C - 1 H - 2 W - 3
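// DataLayout pairs each axis tag with its extent so dimensions can be looked up by name rather than
// by position, e.g. (illustrative, not from the source) DataLayout.NCHW(dim: Dim.init(inDim: [1, 3, 224, 224]))
// answers N == 1, C == 3, H == 224, W == 224. The setters below are still stubs and trap.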
struct DataLayout {
static func NCHW(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout {
return DataLayout.init([(.N, dim[0]), (.C, dim[1]), (.H, dim[2]), (.W, dim[3])])
}
static func NHWC(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout {
return DataLayout.init([(.N, dim[0]), (.H, dim[1]), (.W, dim[2]), (.C, dim[3])])
}
func count() -> Int {
return layoutWithDim.count
}
var N: Int? {
get {
for layoutDim in layoutWithDim {
if layoutDim.0 == .N {
return layoutDim.1
}
}
return nil
}
set {
var newN = (Layout.N, newValue)
if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
return layout == .N
}) {
fatalError()
}
}
}
var C: Int? {
get {
for layoutDim in layoutWithDim {
if layoutDim.0 == .C {
return layoutDim.1
}
}
return nil
}
set {
var newN = (Layout.C, newValue)
if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
return layout == .C
}) {
fatalError()
}
}
}
var H: Int? {
get {
for layoutDim in layoutWithDim {
if layoutDim.0 == .H {
return layoutDim.1
}
}
return nil
}
set {
var newN = (Layout.H, newValue)
if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
return layout == .H
}) {
fatalError()
}
}
}
var W: Int? {
get {
for layoutDim in layoutWithDim {
if layoutDim.0 == .W {
return layoutDim.1
}
}
return nil
}
set {
var newN = (Layout.W, newValue)
if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in
return layout == .W
}) {
fatalError()
}
}
}
init(_ inLayout: [(Layout, Int)]) {
layoutWithDim = inLayout
}
func layout() -> [Layout] {
return layoutWithDim.map({ (layout: Layout, dim: Int) -> Layout in
return layout
})
}
var layoutWithDim: [(Layout, Int)] = [(.N, 0), (.C, 0), (.H, 0), (.W, 0)]
func convertTo(inLayout: [Layout]) {
}
enum Layout: Int{
case N = 0
case C = 1
case H = 2
case W = 3
static func defaultLayout() -> [Layout] {
return [N, C, H, W]
}
}
}
extension DataLayout: Equatable {
public static func == (lhs: DataLayout, rhs: DataLayout) -> Bool {
if lhs.layoutWithDim.count == rhs.layoutWithDim.count {
var result = true
for i in 0..<lhs.layoutWithDim.count {
result = (lhs.layoutWithDim[i].0 == rhs.layoutWithDim[i].0)
if !result {
break
}
}
return result
} else {
return false
}
}
}
public protocol Variant: CustomStringConvertible, CustomDebugStringConvertible {
}
extension Tensor: Variant {
......@@ -95,12 +243,52 @@ extension Tensor: Variant {
extension Texture: Variant {
}
extension GPUResultHolder: Variant {
}
extension InputTexture: Variant {
}
extension MTLTexture where Self: Variant {
}
class FetchHolder: Variant {
var resultBuffer: MTLBuffer?
var dim: [Int]
var capacity: Int
init(inCapacity: Int, inDim: [Int]) {
capacity = inCapacity
dim = inDim
}
func initBuffer(device: MTLDevice) {
resultBuffer = device.makeBuffer(length: capacity * 4, options: [])
}
var result: UnsafeMutablePointer<Float32> {
guard let inResultBuffer = resultBuffer else {
fatalError()
}
return inResultBuffer.contents().bindMemory(to: Float32.self, capacity: capacity)
}
}
extension FetchHolder: CustomStringConvertible, CustomDebugStringConvertible {
var description: String {
fatalError()
// return "\(result)"
}
var debugDescription: String {
fatalError()
// return "\(result)"
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
public class ResultHolder<P: PrecisionType> {
public let dim: [Int]
public let resultArr: [P]
public let elapsedTime: Double
public init(inDim: [Int], inResult: [P], inElapsedTime: Double) {
dim = inDim
resultArr = inResult
elapsedTime = inElapsedTime
}
}
extension ResultHolder: CustomDebugStringConvertible, CustomStringConvertible {
public var debugDescription: String {
var str = ""
str += "Dim: \(dim) \n value:[ "
if resultArr.count < 20 {
for d in resultArr {
str += " \(d) "
}
} else {
for d in stride(from: 0, to: resultArr.count, by: resultArr.count/20) {
str += " \(resultArr[d]) "
}
}
str += " ]"
return str
}
public var description: String {
return debugDescription
}
}
public class Executor<P: PrecisionType> {
var ops: [Runable & InferShaperable] = []
let program: Program
let device: MTLDevice
let queue: MTLCommandQueue
public init(inDevice:MTLDevice, inQueue: MTLCommandQueue, inProgram: Program) throws {
program = inProgram
device = inDevice
queue = inQueue
for block in inProgram.programDesc.blocks {
//block.ops.count
for i in 0..<block.ops.count {
let op = block.ops[i]
do {
let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
op.inferShape()
ops.append(op)
} catch let error {
throw error
}
}
// for op in block.ops {
// do {
// let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
// op.inferShape()
// ops.append(op)
// } catch let error {
// throw error
// }
// }
}
}
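// predict runs the whole graph on a single command buffer: it optionally applies the preprocess
// kernel to the input texture, registers the result as the feed input in the scope, encodes every
// op in order, and only reads the fetch output back inside the completed handler. A hedged call
// sketch (the texture and executor names are assumptions, not from the source):
// try executor.predict(input: cameraTexture, expect: [1, 224, 224, 3], completionHandle: { result in
// print(result.resultArr.count)
// }, preProcessKernle: nil)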
public func predict(input: MTLTexture, expect: [Int], completionHandle: @escaping (ResultHolder<P>) -> Void, preProcessKernle: CusomKernel? = nil) throws {
guard let buffer = queue.makeCommandBuffer() else {
throw PaddleMobileError.predictError(message: "CommandBuffer is nil")
}
let resInput: MTLTexture
if let inPre = preProcessKernle {
do {
try inPre.compute(inputTexuture: input, commandBuffer: buffer)
resInput = inPre.outputTexture
} catch let error {
throw error
}
} else {
resInput = input
}
let beforeDate = Date.init()
let inputTexture = InputTexture.init(inMTLTexture: resInput, inExpectDim: Dim.init(inDim: expect))
program.scope.setInput(input: inputTexture)
for op in ops {
do {
try op.run(device: device, buffer: buffer)
} catch let error {
throw error
}
}
buffer.addCompletedHandler { (commandbuffer) in
// let inputArr = resInput.floatArray(res: { (p:P) -> P in
// return p
// })
// print(inputArr)
// let stridableInput: [(index: Int, value: Float)] = input.stridableFloatArray()
// print(stridableInput)
// let _: Flo? = input.logDesc(header: "input: ", stridable: true)
// for op in self.ops {
// op.delogOutput()
// }
// return
// self.ops[2].delogOutput()
let afterDate = Date.init()
guard let outputVar = self.program.scope.output() else {
fatalError("output nil")
}
guard let output = outputVar as? Texture<P> else {
fatalError("output var type error")
}
let resultHodlder = ResultHolder<P>.init(inDim: output.dim.dims, inResult: output.metalTexture.floatArray(res: { (p:P) -> P in
return p
}), inElapsedTime: afterDate.timeIntervalSince(beforeDate))
completionHandle(resultHodlder)
}
buffer.commit()
}
public func clear() {
program.scope.clear()
}
}
//public let paddle_executor: Executor = Executor.init()
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
public class Genet: Net {
@objc public override init(device: MTLDevice) {
super.init(device: device)
means = [128.0, 128.0, 128.0]
scale = 0.017
except = 0
modelPath = Bundle.main.path(forResource: "genet_model", ofType: nil) ?! "model null"
paramPath = Bundle.main.path(forResource: "genet_params", ofType: nil) ?! "para null"
modelDir = ""
preprocessKernel = GenetPreProccess.init(device: device)
dim = (n: 1, h: 128, w: 128, c: 3)
}
@objc override public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) {
super.init(device:device,paramPointer:paramPointer,paramSize:paramSize,modePointer:modePointer,modelSize:modelSize)
means = [128.0, 128.0, 128.0]
scale = 0.017
except = 0
modelPath = ""
paramPath = ""
modelDir = ""
preprocessKernel = GenetPreProccess.init(device: device)
dim = (n: 1, h: 128, w: 128, c: 3)
}
class GenetPreProccess: CusomKernel {
init(device: MTLDevice) {
let s = CusomKernel.Shape.init(inWidth: 128, inHeight: 128, inChannel: 3)
super.init(device: device, inFunctionName: "genet_preprocess", outputDim: s, usePaddleMobileLib: false)
}
}
override public func resultStr(res: ResultHolder) -> String {
// fatalError()
return " \(res.result![0]) ... "
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
import SwiftProtobuf
public class Loader<P: PrecisionType> {
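// ParaLoader walks the combined params file with the binary layout that read(tensor:) expects:
// a UInt32 version, a UInt64 LoD level followed by that many length-prefixed LoD entries, another
// UInt32 version, an Int32 tensor-desc size whose bytes are skipped, and finally the raw tensor
// data, which is copied straight into tensor.data; nowIndex tracks how far into fileSize we are.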
class ParaLoader {
let file: UnsafeMutablePointer<FILE>
let fileSize: Int
var nowIndex: Int
init(paramPath: String) throws {
guard let tmpFile = fopen(paramPath, "rb") else {
throw PaddleMobileError.loaderError(message: "open param file error" + paramPath)
}
file = tmpFile
fseek(file, 0, SEEK_END)
fileSize = ftell(file)
guard fileSize > 0 else {
throw PaddleMobileError.loaderError(message: "param file size is too small")
}
rewind(file)
nowIndex = 0
}
func read(tensor: Tensor<P>) throws {
guard nowIndex <= fileSize else {
throw PaddleMobileError.loaderError(message: "out of the file range")
}
func pointerReader<T>(type: T.Type) -> T {
let ptr = UnsafeMutablePointer<T>.allocate(capacity: MemoryLayout<T>.size)
fread(ptr, 1, MemoryLayout<T>.size, file)
nowIndex += MemoryLayout<T>.size
let pointee = ptr.pointee
ptr.deinitialize(count: MemoryLayout<UInt32>.size)
ptr.deallocate()
return pointee
}
let _ = pointerReader(type: UInt32.self)
let lodLevel = pointerReader(type: UInt64.self)
for _ in 0..<lodLevel {
let size = pointerReader(type: UInt64.self)
for _ in 0..<Int(size/UInt64(MemoryLayout<size_t>.size)){
_ = pointerReader(type: size_t.self)
}
}
let _ = pointerReader(type: UInt32.self)
let tensorDescSize = pointerReader(type: Int32.self)
fseek(file, Int(tensorDescSize), SEEK_CUR)
nowIndex += Int(tensorDescSize)
/*
The precision is not chosen from the Data Type field here; it is fixed by the outer generic parameter.
*/
// The model currently ships its parameters as Float; this should follow the data type recorded in the model instead.
// let tmpCapacity = MemoryLayout<Float>.size * tensor.numel()
// let tmpPointer = UnsafeMutablePointer<Float>.allocate(capacity: tmpCapacity);
let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file)
guard bytesRead == tensor.data.size else {
throw PaddleMobileError.loaderError(message: "param read size error")
}
// TODO: use script to convert
// let bytesRead = fread(tmpPointer, 1, tmpCapacity, file)
// for i in 0..<tensor.numel() {
// tensor.data[i] = P.init(inFloat: tmpPointer[i])
// }
// tmpPointer.deinitialize(count: tmpCapacity)
// tmpPointer.deallocate()
nowIndex += bytesRead
}
deinit {
fclose(file)
}
}
public init(){}
public func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program{
guard let modelData = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else {
throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !")
}
do {
let protoProgram = try PaddleMobile_Framework_Proto_ProgramDesc.init(
serializedData: modelData)
let originProgramDesc = ProgramDesc.init(protoProgram: protoProgram)
let programDesc = ProgramOptimize<P>.init().optimize(originProgramDesc: originProgramDesc)
print(programDesc)
guard let paraLoader = try? ParaLoader.init(paramPath: paraPath) else {
throw PaddleMobileError.loaderError(message: "load para error")
}
guard programDesc.blocks.count > 0 else {
throw PaddleMobileError.loaderError(message: "count of blocks must greater than 0")
}
// to get feed key and fetch key
let block = programDesc.blocks[0]
guard let firstOp = block.ops.first, let lastOp = block.ops.last else {
throw PaddleMobileError.loaderError(message: "at least two operator")
}
guard firstOp.type == gFeedType, lastOp.type == gFetchType else {
throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch")
}
guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else {
throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found")
}
guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else {
throw PaddleMobileError.loaderError(message: "feed key or fetch key not found")
}
let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey)
// to load memory
for block in programDesc.blocks {
for varDesc in block.vars {
if (varDesc.type == .LodTensor) {
guard let tensorDesc = varDesc.tensorDesc else {
throw PaddleMobileError.loaderError(message: "get tensor desc failed")
}
// guard (try? tensorDesc.dataType.dataTypeSize()) == MemoryLayout<P>.size else {
// throw PaddleMobileError.memoryError(message: "PrecisionType not support")
// }
if (varDesc.persistable
&& varDesc.type != .FeedMiniBatch
&& varDesc.type != .FetchList) {
let dimArr = tensorDesc.dims
guard dimArr.count > 0 else {
throw PaddleMobileError.loaderError(message: "tensor desc dim size error")
}
let dim = Dim.init(inDim: dimArr)
let tensor = Tensor<P>.init(inDim: dim, inLayout: tensorDesc.dataLayout)
do {
try paraLoader.read(tensor: tensor)
} catch let error {
throw error
}
tensor.convert(to: .NHWC)
// tensor.initBuffer(device: device)
scope[varDesc.name] = tensor
} else {
let dim = Dim.init(inDim: tensorDesc.NHWCDim)
scope[varDesc.name] = Texture<P>.init(device: device, inDim: dim)
}
} else {
if varDesc.name == fetchKey {
scope[varDesc.name] = ResultHolder<P>.init(inDim: [], inResult: [], inElapsedTime: 0.0)
} else if varDesc.name == feedKey {
}
}
}
}
let program = Program.init(inProgramDesc: programDesc, inParamPath: paraPath, inScope: scope)
return program
} catch _ {
throw PaddleMobileError.loaderError(message: "protobuf decoder error")
}
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class MobileNet: Net{
class MobilenetPreProccess: CusomKernel {
init(device: MTLDevice) {
let s = CusomKernel.Shape.init(inWidth: 224, inHeight: 224, inChannel: 3)
super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, usePaddleMobileLib: false)
}
}
class PreWords {
var contents: [String] = []
init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) {
if let filePath = inBundle.path(forResource: fileName, ofType: type) {
let string = try! String.init(contentsOfFile: filePath)
contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{
String($0[$0.index($0.startIndex, offsetBy: 10)...])
}
}else{
fatalError("no file call \(fileName)")
}
}
subscript(index: Int) -> String {
return contents[index]
}
}
let labels = PreWords.init(fileName: "synset")
override public func resultStr(res: ResultHolder) -> String {
guard let resPointer = res.result else {
fatalError()
}
var s: [String] = []
(0..<res.capacity).map { resPointer[$0] }.top(r: 5).enumerated().forEach{
s.append(String(format: "%d: %@ (%3.2f%%)", $0 + 1, labels[$1.0], $1.1 * 100))
}
return s.joined(separator: "\n")
}
override init(device: MTLDevice) {
super.init(device: device)
means = [123.68, 116.78, 103.94]
scale = 0.017
except = 0
modelPath = Bundle.main.path(forResource: "model", ofType: nil) ?! "model null"
paramPath = Bundle.main.path(forResource: "params", ofType: nil) ?! "para null"
modelDir = ""
preprocessKernel = MobilenetPreProccess.init(device: device)
dim = (n: 1, h: 224, w: 224, c: 3)
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
public class MobileNet_ssd_hand: Net{
@objc public override init(device: MTLDevice) {
super.init(device: device)
means = [123.68, 116.78, 103.94]
scale = 0.017
except = 2
modelPath = Bundle.main.path(forResource: "ssd_hand_model", ofType: nil) ?! "model null"
paramPath = Bundle.main.path(forResource: "ssd_hand_params", ofType: nil) ?! "para null"
modelDir = ""
preprocessKernel = MobilenetssdPreProccess.init(device: device)
dim = (n: 1, h: 300, w: 300, c: 3)
}
@objc override public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) {
super.init(device:device,paramPointer:paramPointer,paramSize:paramSize,modePointer:modePointer,modelSize:modelSize)
means = [123.68, 116.78, 103.94]
scale = 0.017
except = 2
modelPath = ""
paramPath = ""
modelDir = ""
preprocessKernel = MobilenetssdPreProccess.init(device: device)
dim = (n: 1, h: 300, w: 300, c: 3)
}
class MobilenetssdPreProccess: CusomKernel {
init(device: MTLDevice) {
let s = CusomKernel.Shape.init(inWidth: 300, inHeight: 300, inChannel: 3)
super.init(device: device, inFunctionName: "mobilenet_ssd_preprocess", outputDim: s, usePaddleMobileLib: false)
}
}
override public func resultStr(res: ResultHolder) -> String {
return " \(res)"
}
override func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder {
// guard let interRes = paddleMobileRes.intermediateResults else {
// fatalError(" need have inter result ")
// }
//
// guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as? Texture<Float32> else {
// fatalError(" need score ")
// }
//
// guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture<Float32> else {
// fatalError()
// }
//
// var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3]))
//// print("score: ")
//// print(scoreFormatArr.strideArray())
////
// var bboxArr = bbox.metalTexture.float32Array()
//// print("bbox: ")
//// print(bboxArr.strideArray())
//
// let nmsCompute = NMSCompute.init()
// nmsCompute.scoreThredshold = 0.01
// nmsCompute.nmsTopK = 400
// nmsCompute.keepTopK = 200
// nmsCompute.nmsEta = 1.0
// nmsCompute.nmsThreshold = 0.45
// nmsCompute.background_label = 0;
//
// nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])]
//
// nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])]
// guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else {
// fatalError( " result error " )
// }
//
// let output: [Float32] = result.map { $0.floatValue }
//
//
// return output
fatalError()
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
public class MobileNet_ssd_AR: Net{
@objc public override init(device: MTLDevice) {
super.init(device: device)
means = [103.94, 116.78, 123.68]
scale = 1
except = 2
modelPath = Bundle.main.path(forResource: "ar_model", ofType: nil) ?! "model null"
paramPath = Bundle.main.path(forResource: "ar_params", ofType: nil) ?! "para null"
modelDir = ""
preprocessKernel = MobilenetssdPreProccess.init(device: device)
dim = (n: 1, h: 160, w: 160, c: 3)
}
@objc override public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) {
super.init(device:device,paramPointer:paramPointer,paramSize:paramSize,modePointer:modePointer,modelSize:modelSize)
means = [103.94, 116.78, 123.68]
scale = 1
except = 2
modelPath = ""
paramPath = ""
modelDir = ""
preprocessKernel = MobilenetssdPreProccess.init(device: device)
dim = (n: 1, h: 160, w: 160, c: 3)
}
class MobilenetssdPreProccess: CusomKernel {
init(device: MTLDevice) {
let s = CusomKernel.Shape.init(inWidth: 160, inHeight: 160, inChannel: 3)
super.init(device: device, inFunctionName: "mobilent_ar_preprocess", outputDim: s, usePaddleMobileLib: false)
}
}
override public func resultStr(res: ResultHolder) -> String {
return " \(res.result![0])"
}
override func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder {
guard let interRes = paddleMobileRes.intermediateResults else {
fatalError(" need have inter result ")
}
guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as? FetchHolder else {
fatalError(" need score ")
}
guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? FetchHolder else {
fatalError()
}
// let startDate = Date.init()
// print("scoreFormatArr: ")
//print((0..<score.capacity).map{ score.result[$0] }.strideArray())
//
// print("bbox arr: ")
//
// print((0..<bbox.capacity).map{ bbox.result[$0] }.strideArray())
let nmsCompute = NMSCompute.init()
nmsCompute.scoreThredshold = 0.25
nmsCompute.nmsTopK = 100
nmsCompute.keepTopK = 100
nmsCompute.nmsEta = 1.0
nmsCompute.nmsThreshold = 0.449999988
nmsCompute.background_label = 0;
nmsCompute.scoreDim = [NSNumber.init(value: score.dim[0]), NSNumber.init(value: score.dim[1]), NSNumber.init(value: score.dim[2])]
nmsCompute.bboxDim = [NSNumber.init(value: bbox.dim[0]), NSNumber.init(value: bbox.dim[1]), NSNumber.init(value: bbox.dim[2])]
guard let result = nmsCompute.compute(withScore: score.result, andBBoxs: bbox.result) else {
fatalError( " result error " )
}
let resultHolder = ResultHolder.init(inResult: result.output, inCapacity: Int(result.outputSize))
// for i in 0..<Int(result.outputSize) {
//
// print("i \(i) : \(result.output[i])")
// }
// print(Date.init().timeIntervalSince(startDate))
// print(resultHolder.result![0])
return resultHolder
}
override func updateProgram(program: Program) {
for i in [56, 66, 76, 86, 93, 99] {
let opDesc = program.programDesc.blocks[0].ops[i]
let output = opDesc.outputs["Out"]!.first!
let v = program.scope[output]!
let originTexture = v as! Texture<Float32>
originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1] / 7, originTexture.tensorDim[0] * 7])
originTexture.dim = Dim.init(inDim: [1, 1, originTexture.dim[3] / 7, originTexture.dim[2] * 7])
originTexture.padToFourDim = Dim.init(inDim: [1, 1, originTexture.padToFourDim[3] / 7, originTexture.padToFourDim[2] * 7])
program.scope[output] = originTexture
if i == 99 {
opDesc.attrs["axis"] = 0
} else {
opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
}
}
for i in [58, 59, 88, 89, 95, 96, 68, 69, 78, 79] {
let opDesc = program.programDesc.blocks[0].ops[i]
let output = opDesc.outputs["Out"]!.first!
let v = program.scope[output]!
let originTexture = v as! Texture<Float32>
originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
}
for i in [60, 101, 90, 97, 70, 80] {
let opDesc = program.programDesc.blocks[0].ops[i]
let output = opDesc.outputs["Out"]!.first!
let v = program.scope[output]!
let originTexture = v as! Texture<Float32>
originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
}
for i in [102] {
let opDesc = program.programDesc.blocks[0].ops[i]
for output in opDesc.outputs["Out"]! {
let v = program.scope[output]!
let originTexture = v as! Texture<Float32>
originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
}
opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
print(" split axis \(opDesc.attrs["axis"])")
}
// 99
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
public class ResultHolder: NSObject {
@objc public let result: UnsafeMutablePointer<Float32>?
@objc public let capacity: Int
init(inResult: UnsafeMutablePointer<Float32>?, inCapacity: Int) {
result = inResult
capacity = inCapacity
}
@objc public func releasePointer() {
result?.deinitialize(count: capacity)
result?.deallocate()
}
}
public class Net: NSObject {
var except: Int = 0
var means: [Float] = []
var scale: Float = 0.0
var dim: (n: Int, h: Int, w: Int, c: Int) = (n: 0, h: 0, w: 0, c: 0)
var preprocessKernel: CusomKernel? = nil
var paramPointer: UnsafeMutableRawPointer? = nil
var paramSize: Int = 0
var modelPointer: UnsafeMutableRawPointer? = nil
var modelSize: Int = 0
var modelPath: String = ""
var paramPath: String = ""
var modelDir: String = ""
@objc public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) {
self.paramPointer = paramPointer
self.paramSize = paramSize
self.modelPointer = modePointer
self.modelSize = modelSize
super.init()
}
public func resultStr(res: ResultHolder) -> String {
fatalError()
}
func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder {
return ResultHolder.init(inResult: paddleMobileRes.resultPointer, inCapacity: paddleMobileRes.capacity)
}
@objc public init(device: MTLDevice) {
super.init()
}
func updateProgram(program: Program) {
}
}
......@@ -43,14 +43,31 @@ class OpCreator<P: PrecisionType> {
[gConvType : ConvOp<P>.creat,
gBatchNormType : BatchNormOp<P>.creat,
gReluType : ReluOp<P>.creat,
gElementwiseAddType : ElementwiseAddOp<P>.creat,
gFeedType : FeedOp<P>.creat,
gFetchType : FetchOp<P>.creat,
gConvAddBatchNormReluType : ConvAddBatchNormReluOp<P>.creat,
gPooType : PoolOp<P>.creat,
gSoftmaxType : SoftmaxOp<P>.creat,
gReshapeType : ReshapeOp<P>.creat,
gConvAddType : ConvAddOp<P>.creat,
gDepthConvType : DepthConvOp<P>.creat,
gConcatType : ConcatOp<P>.creat,
gBoxcoderType : BoxcoderOp<P>.creat,
gConvBnReluType : ConvBNReluOp<P>.creat,
gDwConvBnReluType : DwConvBNReluOp<P>.creat,
gMulticlassNMSType : MulticlassNMSOp<P>.creat,
gTransposeType : TransposeOp<P>.creat,
gPriorBoxType : PriorBoxOp<P>.creat,
gPreluType : PreluOp<P>.creat,
gConv2dTransposeType : ConvTransposeOp<P>.creat,
gBilinearInterpType : BilinearInterpOp<P>.creat,
gSplit : SplitOp<P>.creat,
gShape : ShapeOp<P>.creat,
gFlatten : FlattenOp<P>.creat,
gConvAddPreluType : ConvAddPreluOp<P>.creat,
gConvAddAddPreluType : ConvAddAddPreluOp<P>.creat,
gElementwiseAddPreluType: ElementwiseAddPreluOp<P>.creat]
private init(){}
}
......@@ -22,147 +22,199 @@ import Foundation
*/
protocol OpParam {
associatedtype OutputType: Variant
var output: OutputType { get set }
func outputDesc() -> String
associatedtype ParamPrecisionType: PrecisionType
init(opDesc: OpDesc, inScope: Scope) throws
static func getFirstTensor<VarType: Variant>(key: String, map: [String : [String]], from: Scope) throws -> VarType
static func inputX<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
static func inputBiase<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
static func inputMean<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
static func inputScale<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
static func inputVariance<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
static func inputFilter<VarType: Variant>(paraInputs: [String : [String]], from: Scope) throws -> VarType
static func input<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
static func output<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
static func outputY<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
static func inputY<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
static func inputImage<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
static func outputBoxes<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
static func outputOut<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
static func outputVariances<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType
static func getAttr<T>(key: String, attrs: [String : Attr]) throws -> T
static func paramInputAlpha<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType
}
extension OpParam {
func outputDesc() -> String {
return output.debugDescription
}
static func getFirstTensor<VarType: Variant>(key: String, map: [String : [String]], from: Scope) throws -> VarType {
guard let mapKeys = map[key], mapKeys.count > 0 else {
throw PaddleMobileError.paramError(message: key + " not found in \(map) or maped values is empty")
}
guard let variant = from[mapKeys[0]] else {
throw PaddleMobileError.paramError(message: mapKeys[0] + " not found in scope")
}
guard let v = variant as? VarType else {
throw PaddleMobileError.paramError(message: " type error")
}
return v
}
static func outputVariances<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
do {
let tensorVariances: VarType = try getFirstTensor(key: "Variances", map: outputs, from: from)
return tensorVariances
} catch let error {
throw error
}
}
static func paramInputAlpha<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
do {
let alphaTensor: VarType = try getFirstTensor(key: "Alpha", map: inputs, from: from)
return alphaTensor
} catch let error {
throw error
}
}
static func inputImage<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
do {
let tensorImage: VarType = try getFirstTensor(key: "Image", map: inputs, from: from)
return tensorImage
} catch let error {
throw error
}
}
static func inputX<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
do {
let tensorX: VarType = try getFirstTensor(key: "X", map: inputs, from: from)
return tensorX
} catch let error {
throw error
}
}
static func outputBoxes<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
do {
let tensorBox: VarType = try getFirstTensor(key: "Boxes", map: outputs, from: from)
return tensorBox
} catch let error {
throw error
}
}
static func input<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
do {
let tensorInput: VarType = try getFirstTensor(key: "Input", map: inputs, from: from)
return tensorInput
} catch let error {
throw error
}
}
static func output<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
do {
let tensorOutput: VarType = try getFirstTensor(key: "Output", map: outputs, from: from)
return tensorOutput
} catch let error {
throw error
}
}
static func outputY<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
do {
let tensorOutputY: VarType = try getFirstTensor(key: "Y", map: outputs, from: from)
return tensorOutputY
} catch let error {
throw error
}
}
static func inputY<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
do {
let tensorY: VarType = try getFirstTensor(key: "Y", map: inputs, from: from)
return tensorY
} catch let error {
throw error
}
}
static func outputOut<VarType: Variant>(outputs: [String : [String]], from: Scope) throws -> VarType {
do {
let out: VarType = try getFirstTensor(key: "Out", map: outputs, from: from)
return out
} catch let error {
throw error
}
}
static func inputFilter<VarType: Variant>(paraInputs: [String : [String]], from: Scope) throws -> VarType {
do {
let tensorFilter: VarType = try getFirstTensor(key: "Filter", map: paraInputs, from: from)
return tensorFilter
} catch let error {
throw error
}
}
static func inputBiase<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
do {
let tensorBias: VarType = try getFirstTensor(key: "Bias", map: inputs, from: from)
return tensorBias
} catch let error {
throw error
}
}
static func inputMean<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
do {
let tensorMean: VarType = try getFirstTensor(key: "Mean", map: inputs, from: from)
return tensorMean
} catch let error {
throw error
}
}
static func inputScale<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
do {
let tensorScale: VarType = try getFirstTensor(key: "Scale", map: inputs, from: from)
return tensorScale
} catch let error {
throw error
}
}
static func inputVariance<VarType: Variant>(inputs: [String : [String]], from: Scope) throws -> VarType {
do {
let tensorVariance: VarType = try getFirstTensor(key: "Variance", map: inputs, from: from)
return tensorVariance
} catch let error {
throw error
}
}
static func getAttr<T>(key: String, attrs: [String : Attr]) throws -> T{
guard let attr = attrs[key] else {
throw PaddleMobileError.paramError(message: "attr \(key) can't found in: \(attrs)" )
}
static func getAttr<T>(key: String, attrs: [String : Attr]) throws -> T{
guard let attr = attrs[key] else {
throw PaddleMobileError.paramError(message: "attr \(key) can't found in: \(attrs)" )
}
guard let tAttr = attr as? T else {
throw PaddleMobileError.paramError(message: "key: \(key) attr: \(attr) type error" )
}
return tAttr
guard let tAttr = attr as? T else {
throw PaddleMobileError.paramError(message: "key: \(key) attr: \(attr) type error" )
}
return tAttr
}
}
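// The helpers above are the building blocks of every OpParam initializer: named tensors are
// resolved through getFirstTensor and the input*/output* wrappers, scalar attributes through
// getAttr. A minimal hedged sketch for a hypothetical "scale" op with input "X", output "Out"
// and a float attribute "scale" (ScaleParam and its keys are illustrative, not part of this file):
class ScaleParam<P: PrecisionType>: OpParam {
    typealias ParamPrecisionType = P
    let input: Texture<P>
    var output: Texture<P>
    let scale: Float
    required init(opDesc: OpDesc, inScope: Scope) throws {
        do {
            // look the operands up by slot name, then fetch them from the scope
            input = try ScaleParam.inputX(inputs: opDesc.inputs, from: inScope)
            output = try ScaleParam.outputOut(outputs: opDesc.outputs, from: inScope)
            scale = try ScaleParam.getAttr(key: "scale", attrs: opDesc.attrs)
        } catch let error {
            throw error
        }
    }
}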
......@@ -16,100 +16,118 @@ import Metal
import Foundation
protocol Fusion {
static func fusionNode() -> Node
static func change() -> [String : [(from: String, to: String)]]
static func fusionType() -> String
static func fusionNode() -> Node
static func change() -> [String : [(from: String, to: String)]]
static func fusionType() -> String
static func needCheck() -> [(Int, String)]
}
extension Fusion {
static func needCheck() -> [(Int, String)] {
return []
}
}
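// A Fusion pattern is a small chain of Node objects linked with the --> operator; ops that
// match the chain are collapsed into the single fused op named by fusionType(), and
// needCheck() can add extra constraints on individual nodes. A minimal hedged sketch of
// building a conv -> elementwise_add -> relu pattern (pattern construction only):
let fusionPatternHead = Node.init(inType: gConvType)
_ = fusionPatternHead --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gReluType)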
protocol Runable {
func run(device: MTLDevice, buffer: MTLCommandBuffer) throws
func runImpl(device: MTLDevice,buffer: MTLCommandBuffer) throws
func delogOutput()
func run(device: MTLDevice, buffer: MTLCommandBuffer) throws
func runImpl(device: MTLDevice,buffer: MTLCommandBuffer) throws
func delogOutput()
func inputVariant() -> [String : [Variant]]
func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer)
}
extension Runable where Self: OperatorProtocol{
func run(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try runImpl(device: device, buffer: buffer)
} catch let error {
throw error
}
// print(type + ": " + para.outputDesc())
func run(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try runImpl(device: device, buffer: buffer)
} catch let error {
throw error
}
}
func inputVariant() -> [String : [Variant]] {
// return [:]
fatalError(" op \(type) need implement inputVariant")
}
func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) {
fatalError(" need implement ")
}
func delogOutput() {
func delogOutput() {
print(type + ": has no implementation" )
}
print(type + ": has no implementation" )
}
}
protocol Creator where Self: OperatorProtocol{
associatedtype OpType: OperatorProtocol & Runable & InferShaperable
static func creat(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws -> OpType
associatedtype OpType: OperatorProtocol & Runable & InferShaperable
static func creat(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws -> OpType
}
extension Creator where Self: OperatorProtocol {
static func creat(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws -> OpType {
do {
return try OpType.provide(device:device, opDesc: opDesc, inScope: inScope)
} catch let error {
throw error
}
static func creat(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws -> OpType {
do {
return try OpType.provide(device:device, opDesc: opDesc, inScope: inScope)
} catch let error {
throw error
}
}
}
protocol InferShaperable {
func inferShape()
func inferShape()
}
protocol OperatorProtocol {
associatedtype ParamType
associatedtype KerType: Computable where Self.KerType.ParamType == ParamType
var type: String { get }
var scope: Scope { get }
var inputs: [String : [String]] { get }
var paraInputs: [String : [String]] { get set }
var outpus: [String : [String]] { get }
var attrs: [String : Attr] { get }
var para: ParamType { get }
var kernel: KerType { get }
init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws
associatedtype ParamType
associatedtype KerType: Computable where Self.KerType.ParamType == ParamType
var type: String { get }
var scope: Scope { get }
var inputs: [String : [String]] { get }
var paraInputs: [String : [String]] { get set }
var outpus: [String : [String]] { get }
var attrs: [String : Attr] { get }
var para: ParamType { get }
var kernel: KerType { get }
init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws
}
extension OperatorProtocol {
static func provide(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws -> Self {
do {
return try Self.init(device: device, opDesc: opDesc, inScope: inScope)
} catch let error {
throw error
}
static func provide(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws -> Self {
do {
return try Self.init(device: device, opDesc: opDesc, inScope: inScope)
} catch let error {
throw error
}
}
}
class Operator <KernelType: Computable , ParameterType>: OperatorProtocol where KernelType.ParamType == ParameterType {
typealias ParamType = ParameterType
typealias KerType = KernelType
let type: String
let inputs: [String : [String]]
var paraInputs: [String : [String]]
let outpus: [String : [String]]
let attrs: [String : Attr]
let para: ParamType
let scope: Scope
var kernel: KerType
required init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws {
type = opDesc.type
scope = inScope
inputs = opDesc.inputs
outpus = opDesc.outputs
attrs = opDesc.attrs
paraInputs = opDesc.paraInputs
do {
para = try ParamType.init(opDesc:opDesc, inScope: inScope)
} catch let error {
throw error
}
kernel = KernelType.init(device: device, param: para)
typealias ParamType = ParameterType
typealias KerType = KernelType
let type: String
let inputs: [String : [String]]
var paraInputs: [String : [String]]
let outpus: [String : [String]]
let attrs: [String : Attr]
let para: ParamType
let scope: Scope
var kernel: KerType
required init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws {
// print("create op: \(opDesc.type)")
type = opDesc.type
scope = inScope
inputs = opDesc.inputs
outpus = opDesc.outputs
attrs = opDesc.attrs
paraInputs = opDesc.paraInputs
do {
para = try ParamType.init(opDesc:opDesc, inScope: inScope)
} catch let error {
throw error
}
kernel = KernelType.init(device: device, param: para)
}
}
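// A concrete operator is declared by subclassing Operator with its kernel and param types and
// adopting Runable, Creator and InferShaperable; the base class above resolves inputs, outputs,
// attrs, builds the param and creates the kernel. A hedged sketch for a hypothetical identity
// op (IdentityKernel and IdentityParam are assumed to exist and are illustrative only):
class IdentityOp<P: PrecisionType>: Operator<IdentityKernel<P>, IdentityParam<P>>, Runable, Creator, InferShaperable {
    typealias OpType = IdentityOp<P>
    func inferShape() {
        // an identity op leaves the shape untouched
        para.output.dim = para.input.dim
    }
    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
        do {
            try kernel.compute(commandBuffer: buffer, param: para)
        } catch let error {
            throw error
        }
    }
}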
// op infos
......@@ -118,22 +136,57 @@ let gFeedType = "feed"
let gConvType = "conv2d"
let gBatchNormType = "batch_norm"
let gReluType = "relu"
let gElementwiseAdd = "elementwise_add"
let gElementwiseAddType = "elementwise_add"
let gConvAddBatchNormReluType = "conv_add_batchnorm_relu"
let gPooType = "pool2d"
let gSoftmaxType = "softmax"
let gReshapeType = "reshape"
let gConvAddType = "conv_add"
let gDepthConvType = "depthwise_conv2d"
let gPriorBoxType = "prior_box"
let gTransposeType = "transpose"
let gConcatType = "concat"
let gBoxcoderType = "box_coder"
let gMulticlassNMSType = "multiclass_nms"
let gConvBnReluType = "conv_bn_relu"
let gDwConvBnReluType = "depth_conv_bn_relu"
let gPreluType = "prelu"
let gConv2dTransposeType = "conv2d_transpose"
let gBilinearInterpType = "bilinear_interp"
let gSplit = "split"
let gShape = "shape"
let gFlatten = "flatten"
let gConvAddPreluType = "conv_add_prelu"
let gConvAddAddPreluType = "conv_add_add_prelu"
let gElementwiseAddPreluType = "elementwise_add_prelu"
let opInfos = [gConvType : (inputs: ["Input"], outputs: ["Output"]),
gBatchNormType : (inputs: ["X"], outputs: ["Y"]),
gReluType : (inputs: ["X"], outputs: ["Out"]),
gElementwiseAdd : (inputs: ["X"], outputs: ["Out"]),
gElementwiseAddType : (inputs: ["X"], outputs: ["Out"]),
gFeedType : (inputs: ["X"], outputs: ["Out"]),
gFetchType : (inputs: ["X"], outputs: ["Out"]),
gConvAddBatchNormReluType : (inputs: ["Input"], outputs: ["Out"]),
gPooType : (inputs: ["X"], outputs: ["Out"]),
gSoftmaxType : (inputs: ["X"], outputs: ["Out"]),
gReshapeType : (inputs: ["X"], outputs: ["Out"]),
gConvAddType : (inputs: ["Input"], outputs: ["Out"])]
gConvAddType : (inputs: ["Input"], outputs: ["Out"]),
gDepthConvType : (inputs: ["Input"], outputs: ["Output"]),
gConcatType : (inputs: ["X"], outputs: ["Out"]),
gBoxcoderType : (inputs: ["PriorBox", "PriorBoxVar", "TargetBox"], outputs: ["OutputBox"]),
gTransposeType : (inputs: ["X"], outputs: ["Out"]),
gConvBnReluType : (inputs: ["Input"], outputs: ["Out"]),
gDwConvBnReluType : (inputs: ["Input"], outputs: ["Out"]),
gMulticlassNMSType : (inputs: ["BBoxes", "Scores"], outputs: ["Out"]),
gPriorBoxType : (inputs: ["Input", "Image"], outputs: ["Boxes", "Variances"]),
gPreluType : (inputs: ["X"], outputs: ["Out"]),
gConv2dTransposeType : (inputs: ["Input"], outputs: ["Output"]),
gBilinearInterpType : (inputs: ["X"], outputs: ["Out"]),
gSplit : (inputs: ["X"], outputs: ["Out"]),
gShape : (inputs: ["Input"], outputs: ["Out"]),
gFlatten : (inputs: ["X"], outputs: ["Out"]),
gConvAddPreluType : (inputs: ["Input"], outputs: ["Out"]),
gConvAddAddPreluType : (inputs: ["Input"], outputs: ["Out"]),
gElementwiseAddPreluType : (inputs: ["X"], outputs: ["Out"])
]
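// opInfos maps every supported op type to the canonical input/output slot names used when the
// program description is parsed and fused. A minimal sketch of consulting an entry (the lookup
// itself is illustrative; real call sites live in the program/fusion code):
if let convInfo = opInfos[gConvType] {
    print(convInfo.inputs)   // ["Input"]
    print(convInfo.outputs)  // ["Output"]
}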
///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License. */
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class BatchNormParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try BatchNormParam.inputX(inputs: opDesc.inputs, from: inScope)
output = try BatchNormParam.outputY(outputs: opDesc.outputs, from: inScope)
inputBias = try BatchNormParam.inputBiase(inputs: opDesc.paraInputs, from: inScope)
inputMean = try BatchNormParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
inputScale = try BatchNormParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
inputVariance = try BatchNormParam.inputVariance(inputs: opDesc.paraInputs, from: inScope)
epsilon = try BatchNormParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
momentum = try BatchNormParam.getAttr(key: "momentum", attrs: opDesc.attrs)
is_test = try BatchNormParam.getAttr(key: "is_test", attrs: opDesc.attrs)
} catch let error {
throw error
}
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try BatchNormParam.inputX(inputs: opDesc.inputs, from: inScope)
if input.transpose != [0, 2, 3, 1] {
fatalError("batch norm only accepts NHWC")
}
output = try BatchNormParam.outputY(outputs: opDesc.outputs, from: inScope)
bias = try BatchNormParam.getFirstTensor(key: "Bias", map: opDesc.paraInputs, from: inScope)
mean = try BatchNormParam.getFirstTensor(key: "Mean", map: opDesc.paraInputs, from: inScope)
scale = try BatchNormParam.getFirstTensor(key: "Scale", map: opDesc.paraInputs, from: inScope)
variance = try BatchNormParam.getFirstTensor(key: "Variance", map: opDesc.paraInputs, from: inScope)
epsilon = try BatchNormParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
momentum = try BatchNormParam.getAttr(key: "momentum", attrs: opDesc.attrs)
} catch let error {
throw error
}
let input: Texture<P>
var output: Texture<P>
let inputBias: Tensor<ParamPrecisionType>
let inputMean: Tensor<ParamPrecisionType>
let inputScale: Tensor<ParamPrecisionType>
let inputVariance: Tensor<ParamPrecisionType>
let epsilon: Float
let momentum: Float
let is_test: Bool
}
let input: Texture<P>
var output: Texture<P>
let bias: Tensor<P>
let mean: Tensor<P>
let scale: Tensor<P>
let variance: Tensor<P>
let epsilon: Float
let momentum: Float
}
class BatchNormOp<P: PrecisionType>: Operator<BatchNormKernel<P>, BatchNormParam<P>>, Runable, Creator, InferShaperable{
func inferShape() {
para.output.dim = para.input.dim
}
typealias OpType = BatchNormOp<P>
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
typealias OpType = BatchNormOp<P>
func inferShape() {
para.output.dim = para.input.dim
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
let device = para.output.metalTexture!.device
let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
print(outputArray.strideArray())
}
}
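// At inference time batch norm reduces to a per-channel affine transform:
// y = scale * (x - mean) / sqrt(variance + epsilon) + bias.
// A scalar sketch of that arithmetic (the Metal kernel applies it per channel; numbers are illustrative):
func batchNormScalar(_ x: Float, mean: Float, variance: Float, scale: Float, bias: Float, epsilon: Float) -> Float {
    return scale * (x - mean) / (variance + epsilon).squareRoot() + bias
}
// e.g. batchNormScalar(2.0, mean: 1.0, variance: 4.0, scale: 0.5, bias: 0.1, epsilon: 0) == 0.35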
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class BilinearInterpParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try BilinearInterpParam.inputX(inputs: opDesc.inputs, from: inScope)
output = try BilinearInterpParam.outputOut(outputs: opDesc.outputs, from: inScope)
out_h = try BilinearInterpParam.getAttr(key: "out_h", attrs: opDesc.attrs)
out_w = try BilinearInterpParam.getAttr(key: "out_w", attrs: opDesc.attrs)
} catch let error {
throw error
}
if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) {
fatalError()
}
}
let input: Texture<P>
var output: Texture<P>
let out_h: Int
let out_w: Int
}
class BilinearInterpOp<P: PrecisionType>: Operator<BilinearInterpKernel<P>, BilinearInterpParam<P>>, Runable, Creator, InferShaperable{
typealias OpType = BilinearInterpOp<P>
func inferShape() {
// para.output.dim = para.input.dim
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
let device = para.output.metalTexture!.device
let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
// print(outputArray)
print(outputArray.strideArray())
}
}
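// Bilinear interpolation resamples the input texture to out_h x out_w: every output pixel maps
// back to a fractional source position and is a weighted average of the four surrounding input
// pixels. A hedged CPU sketch of the per-axis weight computation (align-corners style is an
// assumption here; the Metal kernel's exact convention may differ):
func bilinearAxisWeights(dstIndex: Int, dstSize: Int, srcSize: Int) -> (low: Int, high: Int, frac: Float) {
    let ratio = dstSize > 1 ? Float(srcSize - 1) / Float(dstSize - 1) : 0
    let srcPos = ratio * Float(dstIndex)
    let low = Int(srcPos)
    let high = min(low + 1, srcSize - 1)
    return (low, high, srcPos - Float(low))
}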
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class BoxcoderParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
priorBox = try BoxcoderParam.getFirstTensor(key: "PriorBox", map: opDesc.inputs, from: inScope)
priorBoxVar = try BoxcoderParam.getFirstTensor(key: "PriorBoxVar", map: opDesc.inputs, from: inScope)
targetBox = try BoxcoderParam.getFirstTensor(key: "TargetBox", map: opDesc.inputs, from: inScope)
output = try BoxcoderParam.getFirstTensor(key: "OutputBox", map: opDesc.outputs, from: inScope)
codeType = try BoxcoderParam.getAttr(key: "code_type", attrs: opDesc.attrs)
boxNormalized = try BoxcoderParam.getAttr(key: "box_normalized", attrs: opDesc.attrs)
} catch let error {
throw error
}
assert(priorBox.tensorDim.cout() == 2)
assert(priorBoxVar.tensorDim.cout() == 2)
assert(targetBox.tensorDim.cout() == 3)
assert(output.tensorDim.cout() == 3)
assert(priorBox.transpose == [0, 1, 2, 3])
assert(priorBoxVar.transpose == [0, 1, 2, 3])
assert(targetBox.transpose == [0, 1, 2, 3])
assert(codeType == "decode_center_size") // encode_center_size is not implemented
assert((targetBox.tensorDim.cout() == 3) && (targetBox.tensorDim[0] == 1)) // N must be 1 (only handle batch size = 1)
}
let priorBox: Texture<P>
let priorBoxVar: Texture<P>
let targetBox: Texture<P>
var output: Texture<P>
let codeType: String
let boxNormalized: Bool
}
class BoxcoderOp<P: PrecisionType>: Operator<BoxcoderKernel<P>, BoxcoderParam<P>>, Runable, Creator, InferShaperable{
typealias OpType = BoxcoderOp<P>
func inferShape() {
// para.output.dim = para.input.dim
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
let device = para.output.metalTexture!.device
let pbv : [Float32] = device.texture2tensor(texture: para.priorBoxVar.metalTexture!, dim: para.priorBoxVar.tensorDim.dims, transpose: para.priorBoxVar.transpose)
let pb : [Float32] = device.texture2tensor(texture: para.priorBox.metalTexture!, dim: para.priorBox.tensorDim.dims, transpose: para.priorBox.transpose)
let tb : [Float32] = device.texture2tensor(texture: para.targetBox.metalTexture!, dim: para.targetBox.tensorDim.dims, transpose: para.targetBox.transpose)
let out : [Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
print(" prior box var ")
print(pbv.strideArray())
print(" target box ")
print(tb.strideArray())
print(" prior box ")
print(pb.strideArray())
print(" output ")
print(out.strideArray())
}
}
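// With code_type == "decode_center_size" each encoded target box is decoded against its prior
// box: the prior is converted to center/size form, the offsets are scaled by the prior-box
// variances, and the result is converted back to corners. A hedged CPU sketch of the standard
// SSD-style decode (box_normalized handling is omitted; the Metal kernel is authoritative):
func decodeCenterSize(prior p: (xmin: Float, ymin: Float, xmax: Float, ymax: Float),
                      variance v: (Float, Float, Float, Float),
                      target t: (x: Float, y: Float, w: Float, h: Float)) -> (Float, Float, Float, Float) {
    let pw = p.xmax - p.xmin, ph = p.ymax - p.ymin
    let pcx = p.xmin + 0.5 * pw, pcy = p.ymin + 0.5 * ph
    let cx = v.0 * t.x * pw + pcx
    let cy = v.1 * t.y * ph + pcy
    let w = exp(v.2 * t.w) * pw
    let h = exp(v.3 * t.h) * ph
    return (cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h)
}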
//
// CNNConvAddBatchNormReluOp.swift
// paddle-mobile
import Foundation
class CNNMPSConvTestParam: TestParam {
var outputTexture: MTLTexture?
var metalParam: MetalConvParam
let filterPointer: UnsafeMutableRawPointer
let biasePointer: UnsafeMutablePointer<Float>
let filterSize: (width: Int, height: Int, channel: Int)
init(inMetalParam: MetalConvParam, inFilter: [Float], inBiase: [Float], inFilterSize: (width: Int, height: Int, channel: Int)) {
metalParam = inMetalParam
filterPointer = UnsafeMutableRawPointer.init(mutating: inFilter)
biasePointer = UnsafeMutablePointer.init(mutating: inBiase)
filterSize = inFilterSize
}
}
@available(iOS 10.0, *)
class CNNMPSConvOp<P: PrecisionType>: Operator<CNNConvKernel<P>, CNNConvParam<P>>, Runable, Creator, InferShaperable, Fusion {
typealias OpType = CNNMPSConvOp<P>
required init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws {
fatalError()
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
}
static func fusionNode() -> Node {
let beginNode = Node.init(inType: gConvType)
_ = beginNode --> Node.init(inType: gElementwiseAddType)
return beginNode
}
static func change() -> [String : [(from: String, to: String)]] {
return [:]
}
static func fusionType() -> String {
return gMPSCNNConvType
}
func inferShape() {
let inDims = para.input.dim
let filterDim = para.filter.dim
let strides = para.stride
let paddings = para.paddings
let dilations = para.dilations
var outDim = [inDims[0]]
for i in 0..<strides.count {
let dilation: Int = Int(dilations[i])
let filterSize: Int = filterDim[i + 1]
let inputSize: Int = inDims[i + 1]
let padding: Int = Int(paddings[i])
let stride: Int = Int(strides[i])
let dKernel = dilation * (filterSize - 1) + 1
let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
outDim.append(outputSize)
}
outDim.append(filterDim[0])
para.output.dim = Dim.init(inDim: outDim)
}
}
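// The inferShape loop above is the usual convolution output-size formula:
// dKernel = dilation * (filterSize - 1) + 1 and outputSize = (inputSize + 2 * padding - dKernel) / stride + 1.
// A small worked example with illustrative numbers (224x224 input, 3x3 filter, stride 2, padding 1):
let exampleInputSize = 224, exampleFilterSize = 3, exampleStride = 2, examplePadding = 1, exampleDilation = 1
let exampleDKernel = exampleDilation * (exampleFilterSize - 1) + 1                                    // 3
let exampleOutputSize = (exampleInputSize + 2 * examplePadding - exampleDKernel) / exampleStride + 1  // 112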
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class ConcatParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
guard let xlist = opDesc.inputs["X"] else {
fatalError()
}
for x in xlist {
guard let variant = inScope[x], let v = variant as? Texture<P> else {
fatalError()
}
if transpose.count == 0 {
transpose = v.transpose
}
if v.transpose != transpose {
fatalError()
}
input.append(v)
}
axis = try ConcatParam.getAttr(key: "axis", attrs: opDesc.attrs)
output = try ConcatParam.outputOut(outputs: opDesc.outputs, from: inScope)
} catch let error {
throw error
}
}
var input: [Texture<P>] = []
var output: Texture<P>
var transpose: [Int] = []
let axis: Int
}
class ConcatOp<P: PrecisionType>: Operator<ConcatKernel<P>, ConcatParam<P>>, Runable, Creator, InferShaperable{
typealias OpType = ConcatOp<P>
func inferShape() {
// let dim = para.input.reduce([0, 0]) {[$0[0] + $1.dim[0], $1.dim[1]]}
// para.output.dim = Dim.init(inDim: dim)
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
let device = para.output.metalTexture!.device
let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
print(outputArray.strideArray())
}
}
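// Concat joins its inputs along `axis`; the param above requires all inputs to share the same
// transpose, and every dimension except `axis` must match. A hedged sketch of how the output
// shape follows from the input shapes (the real shape handling happens when textures are built):
func concatOutputShape(inputShapes: [[Int]], axis: Int) -> [Int] {
    var out = inputShapes[0]
    out[axis] = inputShapes.reduce(0) { $0 + $1[axis] }
    return out
}
// e.g. concatOutputShape(inputShapes: [[1, 4, 8, 8], [1, 2, 8, 8]], axis: 1) == [1, 6, 8, 8]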
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class ConvAddAddPreluParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
filter = try ConvAddAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
input = try ConvAddAddPreluParam.input(inputs: opDesc.inputs, from: inScope)
output = try ConvAddAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
stride = try ConvAddAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs)
paddings = try ConvAddAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
dilations = try ConvAddAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
groups = try ConvAddAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs)
alpha = try ConvAddAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
mode = try ConvAddAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
y = try ConvAddAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
} catch let error {
throw error
}
}
let input: Texture<P>
let y: Tensor<ParamPrecisionType>
let filter: Tensor<ParamPrecisionType>
let mode: String
let alpha: Tensor<P>
var output: Texture<P>
let stride: [Int32]
let paddings: [Int32]
let dilations: [Int32]
let groups: Int
}
class ConvAddAddPreluOp<P: PrecisionType>: Operator<ConvAddAddPreluKernel<P>, ConvAddAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
typealias OpType = ConvAddAddPreluOp<P>
static func fusionNode() -> Node {
let beginNode = Node.init(inType: gConvType)
_ = beginNode
--> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType)
return beginNode
}
static func change() -> [String : [(from: String, to: String)]] {
return [:]
}
static func fusionType() -> String {
return gConvAddAddPreluType
}
static func needCheck() -> [(Int, String)] {
return [(2, "Y"), (2, "X")]
}
func inferShape() {
let inDims = para.input.dim
let filterDim = para.filter.dim
let strides = para.stride
let paddings = para.paddings
let dilations = para.dilations
var outDim = [inDims[0]]
for i in 0..<strides.count {
let dilation: Int = Int(dilations[i])
let filterSize: Int = filterDim[i + 1]
let inputSize: Int = inDims[i + 1]
let padding: Int = Int(paddings[i])
let stride: Int = Int(strides[i])
let dKernel = dilation * (filterSize - 1) + 1
let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
outDim.append(outputSize)
}
outDim.append(filterDim[0])
para.output.dim = Dim.init(inDim: outDim)
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
}
}
......@@ -14,119 +14,117 @@
import Foundation
class ConvAddBatchNormReluParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
filter = try ConvAddBatchNormReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
input = try ConvAddBatchNormReluParam.input(inputs: opDesc.inputs, from: inScope)
output = try ConvAddBatchNormReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
stride = try ConvAddBatchNormReluParam.getAttr(key: "strides", attrs: opDesc.attrs)
paddings = try ConvAddBatchNormReluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
dilations = try ConvAddBatchNormReluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
epsilon = try ConvAddBatchNormReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
groups = try ConvAddBatchNormReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
variance = try ConvAddBatchNormReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope)
bias = try ConvAddBatchNormReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope)
scale = try ConvAddBatchNormReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
mean = try ConvAddBatchNormReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
} catch let error {
throw error
}
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
filter = try ConvAddBatchNormReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
input = try ConvAddBatchNormReluParam.input(inputs: opDesc.inputs, from: inScope)
output = try ConvAddBatchNormReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
stride = try ConvAddBatchNormReluParam.getAttr(key: "strides", attrs: opDesc.attrs)
paddings = try ConvAddBatchNormReluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
dilations = try ConvAddBatchNormReluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
epsilon = try ConvAddBatchNormReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
groups = try ConvAddBatchNormReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
variance = try ConvAddBatchNormReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope)
bias = try ConvAddBatchNormReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope)
scale = try ConvAddBatchNormReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
mean = try ConvAddBatchNormReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
} catch let error {
throw error
}
let input: Texture<P>
let variance: Tensor<ParamPrecisionType>
let bias: Tensor<ParamPrecisionType>
let mean: Tensor<ParamPrecisionType>
let scale: Tensor<ParamPrecisionType>
let y: Tensor<ParamPrecisionType>
let filter: Tensor<ParamPrecisionType>
let epsilon: Float32
var newScale: MTLBuffer?
var newBiase: MTLBuffer?
var output: Texture<P>
let stride: [Int32]
let paddings: [Int32]
let dilations: [Int32]
let groups: Int
}
let input: Texture<P>
let variance: Tensor<ParamPrecisionType>
let bias: Tensor<ParamPrecisionType>
let mean: Tensor<ParamPrecisionType>
let scale: Tensor<ParamPrecisionType>
let y: Tensor<ParamPrecisionType>
let filter: Tensor<ParamPrecisionType>
let epsilon: Float32
var newScale: MTLBuffer?
var newBiase: MTLBuffer?
var output: Texture<P>
let stride: [Int32]
let paddings: [Int32]
let dilations: [Int32]
let groups: Int
}
class ConvAddBatchNormReluOp<P: PrecisionType>: Operator<ConvAddBatchNormReluKernel<P>, ConvAddBatchNormReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
typealias OpType = ConvAddBatchNormReluOp<P>
typealias OpType = ConvAddBatchNormReluOp<P>
func inferShape() {
let inDims = para.input.dim
let filterDim = para.filter.dim
let strides = para.stride
let paddings = para.paddings
let dilations = para.dilations
func inferShape() {
let inDims = para.input.dim
let filterDim = para.filter.dim
let strides = para.stride
let paddings = para.paddings
let dilations = para.dilations
var outDim = [inDims[0]]
for i in 0..<strides.count {
let dilation: Int = Int(dilations[i])
let filterSize: Int = filterDim[i + 1]
let inputSize: Int = inDims[i + 1]
let padding: Int = Int(paddings[i])
let stride: Int = Int(strides[i])
let dKernel = dilation * (filterSize - 1) + 1
let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
outDim.append(outputSize)
}
outDim.append(filterDim[0])
para.output.dim = Dim.init(inDim: outDim)
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
var outDim = [inDims[0]]
for i in 0..<strides.count {
let dilation: Int = Int(dilations[i])
let filterSize: Int = filterDim[i + 1]
let inputSize: Int = inDims[i + 1]
let padding: Int = Int(paddings[i])
let stride: Int = Int(strides[i])
let dKernel = dilation * (filterSize - 1) + 1
let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
outDim.append(outputSize)
}
static func fusionNode() -> Node {
let beginNode = Node.init(inType: gConvType)
_ = beginNode
--> Node.init(inType: gElementwiseAdd)
--> Node.init(inType: gBatchNormType)
--> Node.init(inType: gReluType)
return beginNode
outDim.append(filterDim[0])
para.output.dim = Dim.init(inDim: outDim)
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
static func fusionNode() -> Node {
let beginNode = Node.init(inType: gConvType)
_ = beginNode
--> Node.init(inType: gElementwiseAddType)
--> Node.init(inType: gBatchNormType)
--> Node.init(inType: gReluType)
return beginNode
}
static func change() -> [String : [(from: String, to: String)]] {
return [:]
}
static func fusionType() -> String {
return gConvAddBatchNormReluType
}
func delogOutput() {
print(" conv add batchnorm relu output ")
print(para.output.toTensor().strideArray())
// let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false)
// para.filter.logDataPointer(header: "filter data pointer: ")
// print("filter: \(para.filter)")
static func change() -> [String : [(from: String, to: String)]] {
return [:]
}
// print("biase: \(para.y)")
// print("padding: \(para.paddings)")
// print("stride: \(para.stride)")
static func fusionType() -> String {
return gConvAddBatchNormReluType
}
// let _: P? = para.y.buffer?.logDesc(header: " biase: ", stridable: false)
// let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false)
// let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false)
func delogOutput() {
// let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false)
// para.filter.logDataPointer(header: "filter data pointer: ")
// print("filter: \(para.filter)")
// print("biase: \(para.y)")
// print("padding: \(para.paddings)")
// print("stride: \(para.stride)")
// let _: P? = para.y.buffer?.logDesc(header: " biase: ", stridable: false)
// let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false)
// let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false)
let output = para.output.metalTexture.floatArray { (p: P) -> P in
return p
}
//
writeToLibrary(fileName: "output_112x112x32_2", array: output)
print(" write done")
// let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false)
}
// let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false)
}
}
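// newScale / newBiase hold the batch-norm parameters folded into a single per-channel
// multiply-add, so the fused kernel can do conv + bias + batch norm + relu in one pass:
// newScale = scale / sqrt(variance + epsilon), newBiase = bias - mean * newScale.
// A hedged per-channel sketch of that folding (the kernel precomputes these into MTLBuffers):
func foldBatchNorm(scale: [Float], bias: [Float], mean: [Float], variance: [Float], epsilon: Float) -> (newScale: [Float], newBiase: [Float]) {
    var newScale = [Float](repeating: 0, count: scale.count)
    var newBiase = [Float](repeating: 0, count: scale.count)
    for c in 0..<scale.count {
        newScale[c] = scale[c] / (variance[c] + epsilon).squareRoot()
        newBiase[c] = bias[c] - mean[c] * newScale[c]
    }
    return (newScale, newBiase)
}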
......@@ -15,79 +15,102 @@
import Foundation
class ConvAddParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
filter = try ConvAddParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
input = try ConvAddParam.input(inputs: opDesc.inputs, from: inScope)
output = try ConvAddParam.outputOut(outputs: opDesc.outputs, from: inScope)
stride = try ConvAddParam.getAttr(key: "strides", attrs: opDesc.attrs)
paddings = try ConvAddParam.getAttr(key: "paddings", attrs: opDesc.attrs)
dilations = try ConvAddParam.getAttr(key: "dilations", attrs: opDesc.attrs)
groups = try ConvAddParam.getAttr(key: "groups", attrs: opDesc.attrs)
y = try ConvAddParam.inputY(inputs: opDesc.paraInputs, from: inScope)
} catch let error {
throw error
}
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
filter = try ConvAddParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
input = try ConvAddParam.input(inputs: opDesc.inputs, from: inScope)
output = try ConvAddParam.outputOut(outputs: opDesc.outputs, from: inScope)
stride = try ConvAddParam.getAttr(key: "strides", attrs: opDesc.attrs)
paddings = try ConvAddParam.getAttr(key: "paddings", attrs: opDesc.attrs)
dilations = try ConvAddParam.getAttr(key: "dilations", attrs: opDesc.attrs)
groups = try ConvAddParam.getAttr(key: "groups", attrs: opDesc.attrs)
y = try ConvAddParam.inputY(inputs: opDesc.paraInputs, from: inScope)
} catch let error {
throw error
}
let input: Texture<P>
let y: Tensor<ParamPrecisionType>
let filter: Tensor<ParamPrecisionType>
var output: Texture<P>
let stride: [Int32]
let paddings: [Int32]
let dilations: [Int32]
let groups: Int
}
let input: Texture<P>
let y: Tensor<ParamPrecisionType>
let filter: Tensor<ParamPrecisionType>
var output: Texture<P>
let stride: [Int32]
let paddings: [Int32]
let dilations: [Int32]
let groups: Int
}
class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>, Runable, Creator, InferShaperable, Fusion{
static func fusionNode() -> Node {
let beginNode = Node.init(inType: gConvType)
_ = beginNode
--> Node.init(inType: gElementwiseAdd)
return beginNode
}
static func change() -> [String : [(from: String, to: String)]] {
return [:]
}
static func fusionType() -> String {
return gConvAddType
}
typealias OpType = ConvAddOp<P>
static func fusionNode() -> Node {
let beginNode = Node.init(inType: gConvType)
_ = beginNode
--> Node.init(inType: gElementwiseAddType)
return beginNode
}
static func change() -> [String : [(from: String, to: String)]] {
return [:]
}
static func fusionType() -> String {
return gConvAddType
}
func inferShape() {
typealias OpType = ConvAddOp<P>
let inDims = para.input.dim
let filterDim = para.filter.dim
let strides = para.stride
let paddings = para.paddings
let dilations = para.dilations
func inferShape() {
let inDims = para.input.dim
let filterDim = para.filter.dim
let strides = para.stride
let paddings = para.paddings
let dilations = para.dilations
var outDim = [inDims[0]]
for i in 0..<strides.count {
let dilation: Int = Int(dilations[i])
let filterSize: Int = filterDim[i + 1]
let inputSize: Int = inDims[i + 1]
let padding: Int = Int(paddings[i])
let stride: Int = Int(strides[i])
let dKernel = dilation * (filterSize - 1) + 1
let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
outDim.append(outputSize)
}
outDim.append(filterDim[0])
para.output.dim = Dim.init(inDim: outDim)
var outDim = [inDims[0]]
for i in 0..<strides.count {
let dilation: Int = Int(dilations[i])
let filterSize: Int = filterDim[i + 1]
let inputSize: Int = inDims[i + 1]
let padding: Int = Int(paddings[i])
let stride: Int = Int(strides[i])
let dKernel = dilation * (filterSize - 1) + 1
let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
outDim.append(outputSize)
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
outDim.append(filterDim[0])
para.output.dim = Dim.init(inDim: outDim)
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
// print("op \(type): ")
// print(" padding: ")
// print(para.paddings)
// print("stride: ")
// print(para.stride)
// print("dilations: ")
// print(para.dilations)
// print(" para input dim: ")
// print(para.input.dim)
// print(" para filter dim: ")
// print(para.filter.dim)
// print(" para output dim: ")
// print(para.output.dim)
// print(" biase: ")
// let biase: [Float32] = para.y.buffer.array()
// print(biase)
print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class ConvAddPreluParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
filter = try ConvAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
input = try ConvAddPreluParam.input(inputs: opDesc.inputs, from: inScope)
output = try ConvAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
stride = try ConvAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs)
paddings = try ConvAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
dilations = try ConvAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
groups = try ConvAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs)
alpha = try ConvAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
mode = try ConvAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
y = try ConvAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
} catch let error {
throw error
}
}
let input: Texture<P>
let y: Tensor<ParamPrecisionType>
let filter: Tensor<ParamPrecisionType>
let mode: String
let alpha: Tensor<P>
var output: Texture<P>
let stride: [Int32]
let paddings: [Int32]
let dilations: [Int32]
let groups: Int
}
class ConvAddPreluOp<P: PrecisionType>: Operator<ConvAddPreluKernel<P>, ConvAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
typealias OpType = ConvAddPreluOp<P>
static func fusionNode() -> Node {
let beginNode = Node.init(inType: gConvType)
_ = beginNode
--> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType)
return beginNode
}
static func change() -> [String : [(from: String, to: String)]] {
return [:]
}
static func fusionType() -> String {
return gConvAddPreluType
}
func inferShape() {
let inDims = para.input.dim
let filterDim = para.filter.dim
let strides = para.stride
let paddings = para.paddings
let dilations = para.dilations
var outDim = [inDims[0]]
for i in 0..<strides.count {
let dilation: Int = Int(dilations[i])
let filterSize: Int = filterDim[i + 1]
let inputSize: Int = inDims[i + 1]
let padding: Int = Int(paddings[i])
let stride: Int = Int(strides[i])
let dKernel = dilation * (filterSize - 1) + 1
let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
outDim.append(outputSize)
}
outDim.append(filterDim[0])
para.output.dim = Dim.init(inDim: outDim)
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class ConvBNReluParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
filter = try ConvBNReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
input = try ConvBNReluParam.input(inputs: opDesc.inputs, from: inScope)
output = try ConvBNReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
stride = try ConvBNReluParam.getAttr(key: "strides", attrs: opDesc.attrs)
paddings = try ConvBNReluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
dilations = try ConvBNReluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
epsilon = try ConvBNReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
groups = try ConvBNReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
variance = try ConvBNReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope)
bias = try ConvBNReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope)
scale = try ConvBNReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
mean = try ConvBNReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
} catch let error {
throw error
}
}
let input: Texture<P>
let variance: Tensor<ParamPrecisionType>
let bias: Tensor<ParamPrecisionType>
let mean: Tensor<ParamPrecisionType>
let scale: Tensor<ParamPrecisionType>
let filter: Tensor<ParamPrecisionType>
let epsilon: Float32
var newScale: MTLBuffer?
var newBiase: MTLBuffer?
var output: Texture<P>
let stride: [Int32]
let paddings: [Int32]
let dilations: [Int32]
let groups: Int
}
class ConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
typealias OpType = ConvBNReluOp<P>
func inputs() -> [Variant] {
return [para.input, para.variance, para.bias, para.mean, para.scale, para.filter]
}
func inferShape() {
let inDims = para.input.dim
let filterDim = para.filter.dim
let strides = para.stride
let paddings = para.paddings
let dilations = para.dilations
var outDim = [inDims[0]]
for i in 0..<strides.count {
let dilation: Int = Int(dilations[i])
let filterSize: Int = filterDim[i + 1]
let inputSize: Int = inDims[i + 1]
let padding: Int = Int(paddings[i])
let stride: Int = Int(strides[i])
let dKernel = dilation * (filterSize - 1) + 1
let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
outDim.append(outputSize)
}
outDim.append(filterDim[0])
para.output.dim = Dim.init(inDim: outDim)
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
static func fusionNode() -> Node {
let beginNode = Node.init(inType: gConvType)
_ = beginNode
--> Node.init(inType: gBatchNormType)
--> Node.init(inType: gReluType)
return beginNode
}
static func change() -> [String : [(from: String, to: String)]] {
return [:]
}
static func fusionType() -> String {
return gConvBnReluType
}
func delogOutput() {
print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
}
}
......@@ -15,74 +15,67 @@
import Foundation
class ConvParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
filter = try ConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
input = try ConvParam.input(inputs: opDesc.inputs, from: inScope)
output = try ConvParam.output(outputs: opDesc.outputs, from: inScope)
stride = try ConvParam.getAttr(key: "strides", attrs: opDesc.attrs)
paddings = try ConvParam.getAttr(key: "paddings", attrs: opDesc.attrs)
dilations = try ConvParam.getAttr(key: "dilations", attrs: opDesc.attrs)
groups = try ConvParam.getAttr(key: "groups", attrs: opDesc.attrs)
} catch let error {
throw error
}
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
filter = try ConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
input = try ConvParam.input(inputs: opDesc.inputs, from: inScope)
output = try ConvParam.output(outputs: opDesc.outputs, from: inScope)
stride = try ConvParam.getAttr(key: "strides", attrs: opDesc.attrs)
paddings = try ConvParam.getAttr(key: "paddings", attrs: opDesc.attrs)
dilations = try ConvParam.getAttr(key: "dilations", attrs: opDesc.attrs)
groups = try ConvParam.getAttr(key: "groups", attrs: opDesc.attrs)
} catch let error {
throw error
}
let input: Texture<P>
let filter: Tensor<ParamPrecisionType>
var output: Texture<P>
let stride: [Int32]
let paddings: [Int32]
let dilations: [Int32]
let groups: Int
}
let input: Texture<P>
let filter: Tensor<ParamPrecisionType>
var output: Texture<P>
let stride: [Int32]
let paddings: [Int32]
let dilations: [Int32]
let groups: Int
}
class ConvOp<P: PrecisionType>: Operator<ConvKernel<P>, ConvParam<P>>, Runable, Creator, InferShaperable {
required init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws {
do {
try super.init(device: device, opDesc: opDesc, inScope: inScope)
} catch let error {
throw error
}
}
func inferShape() {
let inDims = para.input.dim
let filterDim = para.filter.dim
let strides = para.stride
let paddings = para.paddings
let dilations = para.dilations
var outDim = [inDims[0]]
for i in 0..<strides.count {
let dilation: Int = Int(dilations[i])
let filterSize: Int = filterDim[i + 1]
let inputSize: Int = inDims[i + 1]
let padding: Int = Int(paddings[i])
let stride: Int = Int(strides[i])
let dKernel = dilation * (filterSize - 1) + 1
let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
outDim.append(outputSize)
}
outDim.append(filterDim[0])
para.output.dim = Dim.init(inDim: outDim)
}
typealias OpType = ConvOp<P>
func inferShape() {
let inDims = para.input.dim
let filterDim = para.filter.dim
let strides = para.stride
let paddings = para.paddings
let dilations = para.dilations
typealias OpType = ConvOp<P>
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
var outDim = [inDims[0]]
for i in 0..<strides.count {
let dilation: Int = Int(dilations[i])
let filterSize: Int = filterDim[i + 1]
let inputSize: Int = inDims[i + 1]
let padding: Int = Int(paddings[i])
let stride: Int = Int(strides[i])
let dKernel = dilation * (filterSize - 1) + 1
let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
outDim.append(outputSize)
}
func delogOutput() {
print("conv output : ")
print(para.output.metalTexture)
// let _: Float16? = para.output.metalTexture.logDesc()
outDim.append(filterDim[0])
para.output.dim = Dim.init(inDim: outDim)
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print("conv output : ")
print(para.output.toTensor().strideArray())
// let _: Float16? = para.output.metalTexture.logDesc()
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class ConvTransposeParam<P: PrecisionType>: ConvParam<P> {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
try super.init(opDesc: opDesc, inScope: inScope)
} catch let error {
throw error
}
}
}
class ConvTransposeOp<P: PrecisionType>: Operator<ConvTransposeKernel<P>, ConvTransposeParam<P>>, Runable, Creator, InferShaperable{
typealias OpType = ConvTransposeOp<P>
func inferShape() {
// para.output.dim = para.input.dim
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
let padToFourDim = para.output.padToFourDim
if para.output.transpose == [0, 1, 2, 3] {
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] {
let output = para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]))
print(output.strideArray())
} else {
print(" not implement")
}
}
}
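// ConvTransposeOp leaves inferShape empty above; for reference, the standard transposed
// convolution output size (ignoring output padding) inverts the conv formula used elsewhere:
// outputSize = (inputSize - 1) * stride - 2 * padding + dilation * (filterSize - 1) + 1.
// A small worked sketch with illustrative numbers (112 input, 3x3 filter, stride 2, padding 1):
let tcInputSize = 112, tcFilterSize = 3, tcStride = 2, tcPadding = 1, tcDilation = 1
let tcDKernel = tcDilation * (tcFilterSize - 1) + 1                               // 3
let tcOutputSize = (tcInputSize - 1) * tcStride - 2 * tcPadding + tcDKernel       // 223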
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class DepthConvOp<P: PrecisionType>: Operator<ConvKernel<P>, ConvParam<P>>, Runable, Creator, InferShaperable {
typealias OpType = DepthConvOp<P>
required init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws {
do {
try super.init(device: device, opDesc: opDesc, inScope: inScope)
} catch let error {
throw error
}
}
func inferShape() {
let inDims = para.input.dim
let filterDim = para.filter.dim
let strides = para.stride
let paddings = para.paddings
let dilations = para.dilations
var outDim = [inDims[0]]
for i in 0..<strides.count {
let dilation: Int = Int(dilations[i])
let filterSize: Int = filterDim[i + 1]
let inputSize: Int = inDims[i + 1]
let padding: Int = Int(paddings[i])
let stride: Int = Int(strides[i])
let dKernel = dilation * (filterSize - 1) + 1
let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
outDim.append(outputSize)
}
outDim.append(filterDim[0])
para.output.dim = Dim.init(inDim: outDim)
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class DwConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
typealias OpType = ConvBNReluOp<P>
func inferShape() {
let inDims = para.input.dim
let filterDim = para.filter.dim
let strides = para.stride
let paddings = para.paddings
let dilations = para.dilations
var outDim = [inDims[0]]
for i in 0..<strides.count {
let dilation: Int = Int(dilations[i])
let filterSize: Int = filterDim[i + 1]
let inputSize: Int = inDims[i + 1]
let padding: Int = Int(paddings[i])
let stride: Int = Int(strides[i])
let dKernel = dilation * (filterSize - 1) + 1
let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
outDim.append(outputSize)
}
outDim.append(filterDim[0])
para.output.dim = Dim.init(inDim: outDim)
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
static func fusionNode() -> Node {
let beginNode = Node.init(inType: gDepthConvType)
_ = beginNode
--> Node.init(inType: gBatchNormType)
--> Node.init(inType: gReluType)
return beginNode
}
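  // Fusion sketch: the chain declared above (depthwise conv -> batch_norm -> relu)
  // is the pattern the graph pass matches; matched subgraphs are presumably replaced
  // by a single node whose type is fusionType(), i.e. gDwConvBnReluType.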
static func change() -> [String : [(from: String, to: String)]] {
return [:]
}
static func fusionType() -> String {
return gDwConvBnReluType
}
func delogOutput() {
print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
}
}
......@@ -15,33 +15,80 @@
import Foundation
class ElementwiseAddParam<P: PrecisionType>: OpParam {
  typealias ParamPrecisionType = P
  required init(opDesc: OpDesc, inScope: Scope) throws {
    do {
      inputX = try ElementwiseAddParam.inputX(inputs: opDesc.inputs, from: inScope)
      output = try ElementwiseAddParam.outputOut(outputs: opDesc.outputs, from: inScope)
      axis = try ElementwiseAddParam.getAttr(key: "axis", attrs: opDesc.attrs)
    } catch let error {
      throw error
    }
    do {
      // inputY is usually another texture in the scope ...
      inputY = try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope)
    } catch _ {
      // ... otherwise it is a weight tensor, which is uploaded as a texture here.
      let tensorY: Tensor<P> = try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope)
      let device = inputX.metalTexture!.device
      inputY = Texture.init(device: device, inDim: tensorY.dim)
      let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel()))
      inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: computePrecision)
    }
    //    required init(device: MTLDevice, param: ElementwiseAddParam<P>) {
    //      param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
    //      if computePrecision == .Float32 {
    //        super.init(device: device, inFunctionName: "elementwise_add")
    //      } else if computePrecision == .Float16 {
    //        super.init(device: device, inFunctionName: "elementwise_add_half")
    //      } else {
    //        fatalError()
    //      }
    //    }
    var offset = axis
    if axis == -1 {
      offset = inputX.tensorDim.cout() - inputY.tensorDim.cout()
    }
    for i in 0..<(inputY.tensorDim.cout()) {
      assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i])
    }
  }
  var inputX: Texture<P>
  var inputY: Texture<P>
  var output: Texture<P>
  var axis: Int
}
class ElementwiseAddOp<P: PrecisionType>: Operator<ElementwiseAddKernel<P>, ElementwiseAddParam<P>>, Runable, Creator, InferShaperable{
  typealias OpType = ElementwiseAddOp<P>
  func inferShape() {
    // para.output.dim = para.input.dim
  }
  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
    do {
      try kernel.compute(commandBuffer: buffer, param: para)
    } catch let error {
      throw error
    }
  }
  func delogOutput() {
    print(" \(type) output: ")
    print(para.output)
    let padToFourDim = para.output.padToFourDim
    if para.output.transpose == [0, 1, 2, 3] {
      let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
      print(outputArray.strideArray())
    } else if para.output.transpose == [0, 2, 3, 1] {
      print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
    } else {
      print(" not implement")
    }
  }
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class ElementwiseAddPreluParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
alpha = try ElementwiseAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
mode = try ElementwiseAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
inputX = try ElementwiseAddPreluParam.inputX(inputs: opDesc.inputs, from: inScope)
output = try ElementwiseAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
axis = try ElementwiseAddPreluParam.getAttr(key: "axis", attrs: opDesc.attrs)
} catch let error {
throw error
}
do {
inputY = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
} catch _ {
let tensorY: Tensor<P> = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
let device = inputX.metalTexture!.device
inputY = Texture.init(device: device, inDim: tensorY.dim)
let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel()))
inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: computePrecision)
}
// required init(device: MTLDevice, param: ElementwiseAddParam<P>) {
// param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
// if computePrecision == .Float32 {
// super.init(device: device, inFunctionName: "elementwise_add")
// } else if computePrecision == .Float16 {
// super.init(device: device, inFunctionName: "elementwise_add_half")
// } else {
// fatalError()
// }
// }
var offset = axis
if axis == -1 {
offset = inputX.tensorDim.cout() - inputY.tensorDim.cout()
}
for i in 0..<(inputY.tensorDim.cout()) {
assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i])
}
}
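    // Broadcast sketch for the axis check above, assuming Paddle's elementwise_add
    // semantics: Y's dims must match a contiguous slice of X's dims starting at `axis`;
    // axis == -1 aligns Y with X's trailing dims, e.g. X [N, C, H, W] + Y [H, W].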
let mode: String
let alpha: Tensor<P>
var inputX: Texture<P>
var inputY: Texture<P>
var output: Texture<P>
var axis: Int
}
class ElementwiseAddPreluOp<P: PrecisionType>: Operator<ElementwiseAddPreluKernel<P>, ElementwiseAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
static func fusionNode() -> Node {
let beginNode = Node.init(inType: gElementwiseAddType)
_ = beginNode
--> Node.init(inType: gPreluType)
return beginNode
}
static func change() -> [String : [(from: String, to: String)]] {
return [:]
}
static func fusionType() -> String {
return gElementwiseAddPreluType
}
typealias OpType = ElementwiseAddPreluOp<P>
func inferShape() {
// para.output.dim = para.input.dim
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
print(para.output)
let padToFourDim = para.output.padToFourDim
if para.output.transpose == [0, 1, 2, 3] {
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] {
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
} else {
print(" not implement")
}
}
}
......@@ -15,54 +15,53 @@
import Foundation
class FeedParam<P: PrecisionType>: OpParam{
  var output: Texture<P>
  var input: InputTexture {
    return scope.input() as! InputTexture
  }
  let scope: Scope

  required init(opDesc: OpDesc, inScope: Scope) throws {
    scope = inScope
    do {
      output = try FeedParam.outputOut(outputs: opDesc.outputs, from: inScope)
    } catch let error {
      throw error
    }
  }

  typealias ParamPrecisionType = P
}

class FeedOp<P: PrecisionType>: Operator<Texture2DTo2DArrayKernel<P>, FeedParam<P>>, Runable, Creator, InferShaperable {
  typealias OpType = FeedOp<P>

  func inferShape() {
    // print("feed input: \(para.input.expectDim)")
    print("feed output: \(para.output.dim)")
    // para.output.dim =
    // para.output.dim = para.input.expectDim
  }

  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
    do {
      try kernel.compute(commandBuffer: buffer, param: para)
    } catch let error {
      throw error
    }
    // let resizeKernel = ResizeKernel<P>.init(device: device)
    // let resizeParam = ResizeParam.init(input: para.input.mtlTexture, output: para.output.metalTexture, expectDim: para.input.expectDim)
    // do {
    //   try resizeKernel.compute(commandBuffer: buffer, param: resizeParam)
    // } catch let error {
    //   throw error
    // }
  }

  func delogOutput() {
    print(" \(type) output: ")
    print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
  }
}
......@@ -15,40 +15,73 @@
import Foundation
class FetchParam<P: PrecisionType>: OpParam{
  var output: FetchHolder
  let input: Texture<P>
  let scope: Scope
  required init(opDesc: OpDesc, inScope: Scope) throws {
    scope = inScope
    do {
      input = try FetchParam.inputX(inputs: opDesc.inputs, from: inScope)
      output = FetchHolder.init(inCapacity: input.numel(), inDim: input.tensorDim.dims)
      scope.setOutput(output: output)
    } catch let error {
      throw error
    }
  }

  typealias ParamPrecisionType = P
}

class FetchKernel<P: PrecisionType>: Kernel, Computable {
  func compute(commandBuffer: MTLCommandBuffer, param: FetchParam<P>) throws {
    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
      throw PaddleMobileError.predictError(message: " encode is nil")
    }
    encoder.setTexture(param.input.metalTexture, index: 0)
    encoder.setBuffer(param.output.resultBuffer!, offset: 0, index: 0)
    encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture)
    encoder.endEncoding()
  }

  required init(device: MTLDevice, param: FetchParam<P>) {
    param.output.initBuffer(device: device)
    if computePrecision == .Float16 {
      if param.input.transpose == [0, 2, 3, 1] {
        super.init(device: device, inFunctionName: "fetch_half")
      } else {
        // fatalError(" not support ")
        super.init(device: device, inFunctionName: "fetch_placeholder_half")
        print(" not support ")
      }
    } else if computePrecision == .Float32 {
      if param.input.transpose == [0, 2, 3, 1] {
        super.init(device: device, inFunctionName: "fetch")
      } else {
        print(" not support ")
        super.init(device: device, inFunctionName: "fetch_placeholder")
        // fatalError(" not support ")
      }
    } else {
      fatalError(" not support ")
    }
  }
}

class FetchOp<P: PrecisionType>: Operator< FetchKernel<P>, FetchParam<P>>, Runable, Creator, InferShaperable {
  typealias OpType = FetchOp<P>

  func inferShape() {
    print(para.input.dim)
  }

  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
    do {
      try kernel.compute(commandBuffer: buffer, param: para)
    } catch let error {
      throw error
    }
  }
}
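// Fetch sketch: FetchParam sizes a FetchHolder to the input's element count and
// registers it as the scope output, so once the command buffer completes the host
// reads the prediction back from resultBuffer instead of sampling the texture.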
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class FlattenParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try FlattenParam.inputX(inputs: opDesc.inputs, from: inScope)
output = try FlattenParam.outputOut(outputs: opDesc.outputs, from: inScope)
axis = try FlattenParam.getAttr(key: "axis", attrs: opDesc.attrs)
} catch let error {
throw error
}
}
let input: Texture<P>
var output: Texture<P>
let axis: Int
}
class FlattenOp<P: PrecisionType>: Operator<FlattenKernel<P>, FlattenParam<P>>, Runable, Creator, InferShaperable{
typealias OpType = FlattenOp<P>
func inferShape() {
// para.output.dim = para.input.dim
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
let device = para.output.metalTexture!.device
let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
print(outputArray.strideArray())
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Metal
import Foundation
public protocol TestParam {
}
public protocol Testable {
associatedtype TestParamType: TestParam
func test(commandBuffer: MTLCommandBuffer, param: TestParamType)
init(device: MTLDevice, testParam: TestParamType)
}
protocol Computable {
associatedtype ParamType: OpParam
func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws
init(device: MTLDevice, param: ParamType)
}
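// Lifecycle sketch: a concrete kernel is typically created once per op; its init
// builds (or fetches) the compute pipeline for `inFunctionName` via device.pipeLine,
// and compute(commandBuffer:param:) is then called on each prediction to encode
// the op's work into the shared command buffer.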
protocol KernelProtocol {
var pipline: MTLComputePipelineState { get set }
var functionName: String { get set }
}
open class Kernel {
let pipline: MTLComputePipelineState
let functionName: String
public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = true) {
pipline = device.pipeLine(funcName: inFunctionName, inPaddleMobileLib: usePaddleMobileLib)
functionName = inFunctionName
}
}
open class CusomKernel: Kernel {
public struct Shape {
public let width: Int
public let height: Int
public let channel: Int
public init(inWidth: Int, inHeight: Int, inChannel: Int){
width = inWidth
height = inHeight
channel = inChannel
}
}
public let outputTexture: MTLTexture
public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, usePaddleMobileLib: Bool = false) {
let textureDesc = MTLTextureDescriptor.init()
textureDesc.textureType = .type2D
textureDesc.width = outputDim.width
textureDesc.height = outputDim.height
textureDesc.depth = (outputDim.channel + 3) / 4
if computePrecision == .Float16 {
textureDesc.pixelFormat = .rgba16Float
} else if computePrecision == .Float32 {
textureDesc.pixelFormat = .rgba32Float
} else {
fatalError()
}
textureDesc.usage = [.shaderRead, .shaderWrite]
textureDesc.storageMode = .shared
outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error "
super.init(device: device, inFunctionName: inFunctionName, usePaddleMobileLib: usePaddleMobileLib)
}
public func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(inputTexuture, index: 0)
encoder.setTexture(outputTexture, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: outputTexture)
encoder.endEncoding()
}
}
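// Usage sketch for CusomKernel (hypothetical names, for illustration only):
//   let preproc = CusomKernel(device: device,
//                             inFunctionName: "preprocess",   // any custom Metal function
//                             outputDim: CusomKernel.Shape(inWidth: 224, inHeight: 224, inChannel: 3),
//                             usePaddleMobileLib: false)
//   try preproc.compute(inputTexuture: cameraTexture, commandBuffer: buffer)
//   // preproc.outputTexture then holds the preprocessed image.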
......@@ -15,53 +15,39 @@
import Foundation
class BatchNormKernel<P: PrecisionType>: Kernel, Computable {
  required init(device: MTLDevice, param: BatchNormParam<P>) {
    // Fold the batch-norm statistics into scale/bias in place:
    //   scale' = scale / sqrt(variance + epsilon)
    //   bias'  = bias - mean * scale'
    let count = param.variance.dim.numel()
    let varianceP = param.variance.data.pointer
    let meanP = param.mean.data.pointer
    let scaleP = param.scale.data.pointer
    let biasP = param.bias.data.pointer
    for i in 0..<count {
      let invStd = P(1 / (Float32(varianceP[i]) + param.epsilon).squareRoot())
      biasP[i] = biasP[i] - meanP[i] * invStd * scaleP[i]
      scaleP[i] = invStd * scaleP[i]
    }
    param.bias.initBuffer(device: device, precision: computePrecision)
    param.scale.initBuffer(device: device, precision: computePrecision)
    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
    if computePrecision == .Float32 {
      super.init(device: device, inFunctionName: "batchnorm")
    } else if computePrecision == .Float16 {
      super.init(device: device, inFunctionName: "batchnorm_half")
    } else {
      fatalError()
    }
  }

  func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam<P>) throws {
    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
      throw PaddleMobileError.predictError(message: " encoder is nil")
    }
    encoder.setTexture(param.input.metalTexture, index: 0)
    encoder.setTexture(param.output.metalTexture, index: 1)
    encoder.setBuffer(param.scale.buffer, offset: 0, index: 0)
    encoder.setBuffer(param.bias.buffer, offset: 0, index: 1)
    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
    encoder.endEncoding()
  }
}
//
// BatchNormRelu.swift
// paddle-mobile
//
// Created by zhangxinjun on 2018/8/23.
// Copyright © 2018年 orange. All rights reserved.
//
import Foundation
class BatchNormReluParam<P: PrecisionType>: BatchNormParam<P> {
}
class BatchNormReluKernel<P: PrecisionType>: Kernel, Computable{
typealias ParamType = BatchNormReluParam<P>
var newScale: MTLBuffer
var newBias: MTLBuffer
required init(device: MTLDevice, testParam: BatchNormReluTestParam) {
newScale = testParam.newScaleBuffer
newBias = testParam.newBiaseBuffer
super.init(device: device, inFunctionName: "batch_norm_relu_3x3")
}
required init(device: MTLDevice, param: BatchNormReluParam<P>) {
guard let newScale = device.makeBuffer(length: param.inputScale.buffer.length) else {
fatalError()
}
guard let newBias = device.makeBuffer(length: param.inputBias.buffer.length) else {
fatalError()
}
self.newScale = newScale
self.newBias = newBias
super.init(device: device, inFunctionName: "batch_norm_relu_3x3")
let varianceBuffer : MTLBuffer = param.inputVariance.buffer
var invStd: [Float32] = Array(repeating: 0, count: varianceBuffer.length)
let varianceContents = varianceBuffer.contents().assumingMemoryBound(to: P.self)
for i in 0..<(varianceBuffer.length / MemoryLayout<P>.stride) {
invStd[i] = 1 / (Float32(varianceContents[i]) + param.epsilon).squareRoot()
}
let newScaleContents = newScale.contents().assumingMemoryBound(to: P.self)
let newBiasContents = newBias.contents().assumingMemoryBound(to: P.self)
let scale : MTLBuffer = param.inputScale.buffer
let scaleContents = scale.contents().assumingMemoryBound(to: P.self)
let bias : MTLBuffer = param.inputBias.buffer
let biasContents = bias.contents().assumingMemoryBound(to: P.self)
let meanContents = param.inputMean.buffer.contents().assumingMemoryBound(to: P.self)
for i in 0..<(newScale.length / MemoryLayout<P>.stride) {
newScaleContents[i] = P(invStd[i] * Float32(scaleContents[i]))
newBiasContents[i] = P(Float32(biasContents[i]) - Float32(meanContents[i]) * invStd[i] * Float32(scaleContents[i]))
}
}
func compute(commandBuffer: MTLCommandBuffer, param: BatchNormReluParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
fatalError()
}
    encoder.setTexture(param.input.metalTexture, index: 0)
    encoder.setTexture(param.output.metalTexture, index: 1)
    encoder.setBuffer(newScale, offset: 0, index: 0)
    encoder.setBuffer(newBias, offset: 0, index: 1)
    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
func testCompute(commandBuffer: MTLCommandBuffer, testParam: BatchNormReluTestParam) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
fatalError()
}
encoder.setTexture(testParam.inputTexture, index: 0)
encoder.setTexture(testParam.outputTexture, index: 1)
encoder.setBuffer(newScale, offset: 0, index: 0)
encoder.setBuffer(newBias, offset: 0, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture)
encoder.endEncoding()
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
struct BilinearInterpMetalParam {
var ratio_h: Float32
var ratio_w: Float32
}
class BilinearInterpKernel<P: PrecisionType>: Kernel, Computable{
func compute(commandBuffer: MTLCommandBuffer, param: BilinearInterpParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
var ratio_h: Float32 = 0
var ratio_w: Float32 = 0
if param.output.tensorDim.dims[2] > 1 {
ratio_h = Float32(param.input.tensorDim.dims[2]-1) / Float32(param.output.tensorDim.dims[2]-1)
}
if param.output.tensorDim.dims[3] > 1 {
ratio_w = Float32(param.input.tensorDim.dims[3]-1) / Float32(param.output.tensorDim.dims[3]-1)
}
var p = BilinearInterpMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w)
encoder.setBytes(&p, length: MemoryLayout<BilinearInterpMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
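  // Ratio sketch for the compute pass above: with the align_corners-style mapping,
  //   ratio = (inSize - 1) / (outSize - 1) when outSize > 1, otherwise 0,
  // e.g. upsampling 16 -> 31 gives ratio = 15 / 30 = 0.5, so each output row/column
  // advances half an input row/column.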
required init(device: MTLDevice, param: BilinearInterpParam<P>) {
param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "bilinear_interp_float")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "bilinear_interp_half")
} else {
fatalError()
}
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
struct BoxcoderMetalParam {
}
class BoxcoderKernel<P: PrecisionType>: Kernel, Computable{
func compute(commandBuffer: MTLCommandBuffer, param: BoxcoderParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.priorBox.metalTexture, index: 0)
encoder.setTexture(param.priorBoxVar.metalTexture, index: 1)
encoder.setTexture(param.targetBox.metalTexture, index: 2)
encoder.setTexture(param.output.metalTexture, index: 3)
var bmp = BoxcoderMetalParam.init()
encoder.setBytes(&bmp, length: MemoryLayout<BoxcoderMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
required init(device: MTLDevice, param: BoxcoderParam<P>) {
param.output.initTexture(device: device, inTranspose: [0, 3, 1, 2], computePrecision: computePrecision)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "boxcoder_float")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "boxcoder_half")
} else {
fatalError()
}
}
}
//
// CNNConvKernel.swift
// paddle-mobile
//
import Foundation
import Metal
import Accelerate
import MetalPerformanceShaders
@available(iOS 10.0, *)
class WeightsDataSource: NSObject, MPSCNNConvolutionDataSource {
let desc: MPSCNNConvolutionDescriptor
let weight:UnsafeMutableRawPointer
let bias:UnsafeMutablePointer<Float>
init(inDesc: MPSCNNConvolutionDescriptor, inWeight: UnsafeMutableRawPointer, inBias: UnsafeMutablePointer<Float>) {
desc = inDesc
weight = inWeight
bias = inBias
}
func dataType() -> MPSDataType {
return .float32
}
func descriptor() -> MPSCNNConvolutionDescriptor {
return desc
}
func weights() -> UnsafeMutableRawPointer {
return self.weight
}
func biasTerms() -> UnsafeMutablePointer<Float>? {
return self.bias
}
func load() -> Bool {
return true
}
func purge() {
}
func label() -> String? {
return "Conv"
}
}
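// WeightsDataSource adapts raw float32 filter/bias pointers to the
// MPSCNNConvolutionDataSource protocol; MetalPerformanceShaders pulls the data
// through load()/weights()/biasTerms() when the MPSCNNConvolution is built.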
@available(iOS 10.0, *)
class CNNConvParam<P: PrecisionType>: OpParam{
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
filter = try CNNConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
input = try CNNConvParam.input(inputs: opDesc.inputs, from: inScope)
output = try CNNConvParam.outputOut(outputs: opDesc.outputs, from: inScope)
stride = try CNNConvParam.getAttr(key: "strides", attrs: opDesc.attrs)
paddings = try CNNConvParam.getAttr(key: "paddings", attrs: opDesc.attrs)
// Not needed for now
dilations = try CNNConvParam.getAttr(key: "dilations", attrs: opDesc.attrs)
// Not needed for now
groups = try CNNConvParam.getAttr(key: "groups", attrs: opDesc.attrs)
variance = try CNNConvParam.inputVariance(inputs: opDesc.paraInputs, from: inScope)
// bias
y = try CNNConvParam.inputY(inputs: opDesc.paraInputs, from: inScope)
} catch let error {
throw error
}
}
var input: Texture<P>
let variance: Tensor<ParamPrecisionType>
let y: Tensor<ParamPrecisionType>
let filter: Tensor<ParamPrecisionType>
var output: Texture<P>
let stride: [Int32]
let paddings: [Int32]
let dilations: [Int32]
let groups: Int
}
@available(iOS 10.0, *)
class CNNConvKernel<P: PrecisionType>: Kernel, Computable {
typealias ParamType = CNNConvParam<P>
var mpsImageCreator: MpsImageCreator<P>?
var activation:MPSCNNNeuron?
var conv:MPSCNNConvolution?
var weightDataSource:WeightsDataSource?
var param: CNNConvParam<P>?
var device: MTLDevice?
required init(device:MTLDevice, testParam:CNNMPSConvTestParam) {
self.device = device
let desc = MPSCNNConvolutionDescriptor(kernelWidth: testParam.filterSize.width, kernelHeight: testParam.filterSize.height, inputFeatureChannels: testParam.filterSize.channel, outputFeatureChannels: testParam.filterSize.channel, neuronFilter: activation)
desc.strideInPixelsX = Int(testParam.metalParam.offsetX)
desc.strideInPixelsY = Int(testParam.metalParam.offsetY)
weightDataSource = WeightsDataSource(inDesc: desc, inWeight:testParam.filterPointer, inBias:testParam.biasePointer)
if #available(iOS 11.0, *) {
conv = MPSCNNConvolution(device: self.device!, weights: weightDataSource!)
} else {
// Fallback on earlier versions
}
super.init(device: device, inFunctionName: "")
}
required init(device:MTLDevice, param:CNNConvParam<P>) {
self.device = device
let inChannels: Int
let outChannels: Int
if param.y.dim.cout() == 4 {
inChannels = (param.y.dim[3])
outChannels = inChannels
} else {
inChannels = 0
outChannels = inChannels
}
let desc = MPSCNNConvolutionDescriptor(kernelWidth: param.filter.width, kernelHeight: param.filter.height, inputFeatureChannels: inChannels, outputFeatureChannels: outChannels, neuronFilter: activation)
desc.strideInPixelsX = Int(param.stride[0])
desc.strideInPixelsY = Int(param.stride[1])
weightDataSource = WeightsDataSource(inDesc: desc, inWeight:param.filter.data.pointer as! UnsafeMutablePointer<Float>, inBias: param.y.data.pointer as! UnsafeMutablePointer<Float>)
if #available(iOS 11.0, *) {
conv = MPSCNNConvolution(device: self.device!, weights: weightDataSource!)
} else {
// Fallback on earlier versions
}
super.init(device: device, inFunctionName: "")
}
func compute(commandBuffer: MTLCommandBuffer, param: CNNConvParam<P>) throws {
let inputImage:MPSImage = (mpsImageCreator?.createMPSImage(device: device!))!
var outputImage = (mpsImageCreator?.createMPSImage(device: device!))!
// Performs the conv and add steps in one pass; the add uses the bias as its parameter and is invoked through the Metal API.
conv?.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage)
param.input = outputImage.texture as! Texture<P>
}
func testCompute(commandBuffer: MTLCommandBuffer, testParam: CNNMPSConvTestParam) throws {
let inputImage:MPSImage = (mpsImageCreator?.createMPSImage(device: device!))!
var outputImage = (mpsImageCreator?.createMPSImage(device: device!))!
// Performs the conv and add steps in one pass; the add uses the bias as its parameter and is invoked through the Metal API.
conv?.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage)
testParam.outputTexture = outputImage.texture
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class ConcatKernel<P: PrecisionType>: Kernel, Computable{
func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
required init(device: MTLDevice, param: ConcatParam<P>) {
super.init(device: device, inFunctionName: "concat")
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
struct ConcatTestParam: TestParam {
var input: [MTLTexture]
var output: MTLTexture
var dims: [[Int]]
var axis: Int
var odim: [Int]
}
struct ConcatMetalParam {
var odim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1)
var axis: Int32 = 0
var offset: Int32 = 0
var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
var vdim: (Int32, Int32, Int32, Int32, Int32, Int32) = (0, 0, 0, 0, 0, 0)
}
class ConcatKernel<P: PrecisionType>: Kernel, Computable{
var v = "normal"
var pm = ConcatMetalParam.init()
func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
let num = param.input.count
for i in 0..<num {
encoder.setTexture(param.input[i].metalTexture, index: i)
}
encoder.setTexture(param.output.metalTexture, index: num)
if v == "normal" {
encoder.setTexture(param.output.metalTexture, index: num + 1)
}
encoder.setBytes(&pm, length: MemoryLayout<ConcatMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
required init(device: MTLDevice, param: ConcatParam<P>) {
param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: computePrecision)
let orank = param.output.tensorDim.cout()
let num = param.input.count
assert(num <= 6)
var axis = 4 - param.output.tensorDim.cout() + param.axis
for i in 0..<4 {
if param.transpose[i] == axis {
axis = i
break
}
}
pm.axis = Int32(axis)
pm.odim = (Int32(param.output.dim[0]), Int32(param.output.dim[1]), Int32(param.output.dim[2]), Int32(param.output.dim[3]))
pm.trans = (Int32(param.output.transpose[0]), Int32(param.output.transpose[1]), Int32(param.output.transpose[2]), Int32(param.output.transpose[3]))
var vdim: [Int] = [0, 0, 0, 0, 0, 0]
for i in 0..<num {
vdim[i] = param.input[i].dim[axis]
}
if orank == 4 {
if axis == 1 {
v = "y"
} else if axis == 2 {
v = "x"
} else {
if (param.output.dim[0] == 1) && axis == 3 {
var vz = true
for i in 0..<num {
if vdim[i] % 4 != 0 {
vz = false
break
}
}
if vz {
v = "z"
for i in 0..<num {
vdim[i] = vdim[i] / 4
}
}
}
}
} else if orank == 3 {
if axis == 2 {
v = "y"
} else if axis == 3 {
v = "x"
} else if axis == 1 {
var vz = true
for i in 0..<num {
if vdim[i] % 4 != 0 {
vz = false
break
}
}
if vz {
v = "z"
for i in 0..<num {
vdim[i] = vdim[i] / 4
}
}
}
} else {
if axis == 2 {
v = "y"
} else if axis == 3 {
var vx = true
for i in 0..<num {
if vdim[i] % 4 != 0 {
vx = false
break
}
}
if vx {
v = "x"
for i in 0..<num {
vdim[i] = vdim[i] / 4
}
}
}
}
pm.vdim = (Int32(vdim[0]), Int32(vdim[1]), Int32(vdim[2]), Int32(vdim[3]), Int32(vdim[4]), Int32(vdim[5]))
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_float")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_half")
} else {
fatalError()
}
}
required init(device: MTLDevice, testParam: ConcatTestParam) {
super.init(device: device, inFunctionName: "concat")
}
}
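// Kernel-name sketch: the init above specializes the Metal function as
//   concat_<outputRank>_<inputCount>_<variant>_<precision>
// where <variant> is "normal", "x", "y" or "z" depending on which axis the
// inputs are joined along (the packed variants require every input's concat
// dimension to be a multiple of 4), e.g. "concat_4_3_y_half".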
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class ConvAddAddPreluKernel<P: PrecisionType>: Kernel, Computable {
var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvAddAddPreluParam<P>) {
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
param.filter.initBuffer(device: device, precision: computePrecision)
param.y.initBuffer(device: device, precision: computePrecision)
param.alpha.initBuffer(device: device, precision: computePrecision)
if computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half")
} else {
super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half")
}
} else if param.filter.channel == 1 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half")
} else {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half")
}
} else if param.filter.width == 3 && param.filter.height == 3 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half")
} else {
super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half")
}
} else if param.filter.width == 1 && param.filter.height == 5 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half")
} else {
super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half")
}
} else if param.filter.width == 5 && param.filter.height == 1 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half")
} else {
super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half")
}
} else {
fatalError(" unsupport yet ")
}
} else if computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float")
} else {
super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float")
}
} else if param.filter.channel == 1 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float")
} else {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float")
}
} else if param.filter.width == 3 && param.filter.height == 3 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float")
} else {
super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float")
}
} else if param.filter.width == 1 && param.filter.height == 5 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float")
} else {
super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float")
}
} else if param.filter.width == 5 && param.filter.height == 1 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float")
} else {
super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float")
}
} else {
fatalError(" unsupport yet ")
}
} else {
fatalError()
}
let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
// print(" function: \(functionName)")
// print("offset x: \(offsetX)")
// print("offset y: \(offsetY)")
let offsetZ = 0.0
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
// print("metal param: ")
// print(inMetalParam)
metalParam = inMetalParam
}
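    // Offset sketch for the init above: the sampling window is centred on the
    // effective (dilated) filter, so
    //   offset = (dilation * (filterSize - 1) + 1) / 2 - padding
    // e.g. a 3x3 filter with dilation 1 and padding 1 gives 3/2 - 1 = 0
    // (integer division), i.e. "same" padding keeps the window centred.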
func compute(commandBuffer: MTLCommandBuffer, param: ConvAddAddPreluParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
}
......@@ -15,124 +15,165 @@
import Foundation
struct ConvAddBatchNormReluTestParam: TestParam {
let inputTexture: MTLTexture
let outputTexture: MTLTexture
var metalParam: MetalConvParam
let filterBuffer: MTLBuffer
let biaseBuffer: MTLBuffer
let newScaleBuffer: MTLBuffer
let newBiaseBuffer: MTLBuffer
let filterSize: (width: Int, height: Int, channel: Int)
init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) {
inputTexture = inInputTexture
outputTexture = inOutputTexture
metalParam = inMetalParam
filterBuffer = inFilterBuffer
biaseBuffer = inBiaseBuffer
newScaleBuffer = inNewScaleBuffer
newBiaseBuffer = inNewBiaseBuffer
filterSize = inFilterSize
}
}
class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
  required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam) {
    if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 {
      super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
    } else if testParam.filterSize.channel == 1 {
      super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
    } else {
      super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
    }
  }

  var metalParam: MetalConvParam!

  required init(device: MTLDevice, param: ConvAddBatchNormReluParam<P>) {
    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
    param.filter.initBuffer(device: device, precision: computePrecision)
    param.y.initBuffer(device: device, precision: computePrecision)
    param.variance.initBuffer(device: device, precision: .Float32)
    param.mean.initBuffer(device: device, precision: .Float32)
    param.scale.initBuffer(device: device, precision: .Float32)
    param.bias.initBuffer(device: device, precision: .Float32)

    if computePrecision == .Float32 {
      if param.filter.width == 1 && param.filter.height == 1 {
        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
      } else if param.filter.channel == 1 {
        super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
      } else if param.filter.width == 3 && param.filter.height == 3 {
        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
      } else {
        fatalError(" unsupport ")
      }
    } else if computePrecision == .Float16 {
      if param.filter.width == 1 && param.filter.height == 1 {
        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1_half")
      } else if param.filter.channel == 1 {
        super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3_half")
      } else if param.filter.width == 3 && param.filter.height == 3 {
        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3_half")
      } else {
        fatalError(" unsupport ")
      }
    } else {
      fatalError()
    }

    let offsetX = param.filter.width/2 - Int(param.paddings[0])
    let offsetY = param.filter.height/2 - Int(param.paddings[1])
    print("offset x: \(offsetX)")
    print("offset y: \(offsetY)")
    let offsetZ = 0.0
    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))

    // Fold the batch-norm statistics into per-channel newScale / newBiase buffers,
    // so the fused kernel only needs two extra inputs at runtime.
    var invs: [P] = []
    let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
    for i in 0..<param.variance.buffer.length/MemoryLayout<P>.stride {
      let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
      invs.append(P(inv))
    }

    let newScale: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.scale.buffer.length)
    let newBiase: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.bias.buffer.length)
    let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
    let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
    let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
    for i in 0..<param.scale.buffer.length/MemoryLayout<P>.stride {
      newScale[i] = invs[i] * scaleContents[i]
      newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i]
    }

    //    var newScaleFP16: UnsafeMutableRawPointer
    //    float32ToFloat16(input: newScale as! UnsafeMutablePointer<Float32>, output: newScaleFP16, count: param.scale.buffer.length / MemoryLayout<P>.size)
    //    let newBiaseFloat16 = device.makeBuffer(length: <#T##Int#>, options: <#T##MTLResourceOptions#>)

    var newBiaseBuffer: MTLBuffer
    var newScaleBuffer: MTLBuffer
    if computePrecision == .Float32 {
      newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)!
      newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)!
    } else if computePrecision == .Float16 {
      newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
      newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
      float32ToFloat16(input: newBiase as! UnsafeMutablePointer<Float32>, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout<P>.size)
      float32ToFloat16(input: newScale as! UnsafeMutablePointer<Float32>, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout<P>.size)
    } else {
      fatalError(" unsupport ")
    }
    param.newBiase = newBiaseBuffer
    param.newScale = newScaleBuffer

    newScale.deinitialize(count: param.scale.buffer.length)
    newScale.deallocate()
    newBiase.deinitialize(count: param.bias.buffer.length)
    newBiase.deallocate()
  }

  func compute(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluParam<P>) throws {
    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
      throw PaddleMobileError.predictError(message: " encode is nil")
    }
    encoder.setTexture(param.input.metalTexture, index: 0)
    encoder.setTexture(param.output.metalTexture, index: 1)
    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
    encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
    encoder.setBuffer(param.newScale!, offset: 0, index: 3)
    encoder.setBuffer(param.newBiase!, offset: 0, index: 4)
    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
    encoder.endEncoding()
  }

  public func test(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluTestParam) {
    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
      fatalError()
    }
    encoder.setTexture(param.inputTexture, index: 0)
    encoder.setTexture(param.outputTexture, index: 1)
    var inMetalParam = param.metalParam
    encoder.setBytes(&inMetalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
    encoder.setBuffer(param.filterBuffer, offset: 0, index: 1)
    encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2)
    encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3)
    encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4)
    encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
    encoder.endEncoding()
  }
}
......@@ -15,33 +15,73 @@
import Foundation
class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
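// Selects a conv_add Metal function specialized for the compute precision and filter shape,
// then precomputes the sampling offsets passed to the shader through MetalConvParam.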
var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvAddParam<P>) {
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
param.filter.initBuffer(device: device, precision: computePrecision)
param.y.initBuffer(device: device, precision: computePrecision)
if computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_1x1_half")
} else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half")
} else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_add_3x3_half")
} else if param.filter.width == 1 && param.filter.height == 5 {
super.init(device: device, inFunctionName: "conv_add_5x1_half")
} else if param.filter.width == 5 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_1x5_half")
} else {
fatalError(" unsupport yet ")
}
} else if computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_1x1")
} else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3")
} else if param.filter.width == 1 && param.filter.height == 5 {
super.init(device: device, inFunctionName: "conv_add_5x1")
} else if param.filter.width == 5 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_1x5")
} else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_add_3x3")
} else {
fatalError(" unsupport yet ")
}
} else {
fatalError()
}
let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
// print(" function: \(functionName)")
// print("offset x: \(offsetX)")
// print("offset y: \(offsetY)")
let offsetZ = 0.0
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
// print("metal param: ")
// print(inMetalParam)
metalParam = inMetalParam
}
func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class ConvAddPreluKernel<P: PrecisionType>: Kernel, Computable {
var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvAddPreluParam<P>) {
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
param.filter.initBuffer(device: device, precision: computePrecision)
param.y.initBuffer(device: device, precision: computePrecision)
param.alpha.initBuffer(device: device, precision: computePrecision)
if computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half")
} else {
super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half")
}
} else if param.filter.channel == 1 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half")
} else {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half")
}
} else if param.filter.width == 3 && param.filter.height == 3 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half")
} else {
super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half")
}
} else if param.filter.width == 1 && param.filter.height == 5 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half")
} else {
super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half")
}
} else if param.filter.width == 5 && param.filter.height == 1 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half")
} else {
super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half")
}
} else {
fatalError(" unsupport yet ")
}
} else if computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float")
} else {
super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float")
}
} else if param.filter.channel == 1 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float")
} else {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float")
}
} else if param.filter.width == 3 && param.filter.height == 3 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float")
} else {
super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float")
}
} else if param.filter.width == 1 && param.filter.height == 5 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float")
} else {
super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float")
}
} else if param.filter.width == 5 && param.filter.height == 1 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float")
} else {
super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float")
}
} else {
fatalError(" unsupport yet ")
}
} else {
fatalError()
}
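// The offset maps an output coordinate back to the top-left sample of its receptive field:
// offset = (dilation * (kernelSize - 1) + 1) / 2 - padding, computed per axis below.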
let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
// print(" function: \(functionName)")
// print("offset x: \(offsetX)")
// print("offset y: \(offsetY)")
let offsetZ = 0.0
let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
// print("metal param: ")
// print(inMetalParam)
metalParam = inMetalParam
}
func compute(commandBuffer: MTLCommandBuffer, param: ConvAddPreluParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
import MetalPerformanceShaders
struct ConvBNReluTestParam: TestParam {
let inputTexture: MTLTexture
let outputTexture: MTLTexture
var metalParam: MetalConvParam
let filterBuffer: MTLBuffer
let biaseBuffer: MTLBuffer
let newScaleBuffer: MTLBuffer
let newBiaseBuffer: MTLBuffer
let filterSize: (width: Int, height: Int, channel: Int)
init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) {
inputTexture = inInputTexture
outputTexture = inOutputTexture
metalParam = inMetalParam
filterBuffer = inFilterBuffer
biaseBuffer = inBiaseBuffer
newScaleBuffer = inNewScaleBuffer
newBiaseBuffer = inNewBiaseBuffer
filterSize = inFilterSize
}
}
class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
required init(device: MTLDevice, testParam: ConvBNReluTestParam) {
if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 {
super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1")
} else if testParam.filterSize.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3")
} else {
super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3")
}
}
var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvBNReluParam<P>) {
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
param.filter.initBuffer(device: device, precision: computePrecision)
param.variance.initBuffer(device: device, precision: .Float32)
param.mean.initBuffer(device: device, precision: .Float32)
param.scale.initBuffer(device: device, precision: .Float32)
param.bias.initBuffer(device: device, precision: .Float32)
if computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1")
} else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3")
} else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3")
} else {
fatalError(" unsupport ")
}
} else if computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half")
} else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half")
} else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half")
} else {
fatalError(" unsupport ")
}
} else {
fatalError()
}
let offsetX = param.filter.width/2 - Int(param.paddings[0])
let offsetY = param.filter.height/2 - Int(param.paddings[1])
// print(" param filter width: \(param.filter.width)")
// print(" param filter height: \(param.filter.height)")
//
// print(" param paddings: \(param.paddings)")
//
// print("ConvBNReluKernel offset x: \(offsetX)")
// print("ConvBNReluKernel offset y: \(offsetY)")
let offsetZ = 0.0
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
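// Fold batch norm into a per-channel scale and shift: newScale = scale / sqrt(variance + epsilon),
// newBiase = bias - mean * newScale, so the shader only applies a multiply-add.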
var invs: [P] = []
let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
for i in 0..<param.variance.buffer.length/MemoryLayout<P>.stride {
let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
invs.append(P(inv))
}
let newScale: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.scale.buffer.length)
let newBiase: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.bias.buffer.length)
let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
for i in 0..<param.scale.buffer.length/MemoryLayout<P>.stride {
newScale[i] = invs[i] * scaleContents[i]
newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i]
}
var newBiaseBuffer: MTLBuffer
var newScaleBuffer: MTLBuffer
if computePrecision == .Float32 {
newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)!
newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)!
} else if computePrecision == .Float16 {
newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
float32ToFloat16(input: newBiase as! UnsafeMutablePointer<Float32>, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout<P>.size)
float32ToFloat16(input: newScale as! UnsafeMutablePointer<Float32>, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout<P>.size)
} else {
fatalError(" unsupport ")
}
param.newBiase = newBiaseBuffer
param.newScale = newScaleBuffer
newScale.deinitialize(count: param.scale.buffer.length)
newScale.deallocate()
newBiase.deinitialize(count: param.bias.buffer.length)
newBiase.deallocate()
}
func compute(commandBuffer: MTLCommandBuffer, param: ConvBNReluParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
encoder.setBuffer(param.newScale!, offset: 0, index: 2)
encoder.setBuffer(param.newBiase!, offset: 0, index: 3)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
public func test(commandBuffer: MTLCommandBuffer, param: ConvBNReluTestParam) {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
fatalError()
}
encoder.setTexture(param.inputTexture, index: 0)
encoder.setTexture(param.outputTexture, index: 1)
var inMetalParam = param.metalParam
encoder.setBytes(&inMetalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filterBuffer, offset: 0, index: 1)
encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 2)
encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 3)
encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
encoder.endEncoding()
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
using namespace metal;
struct MetalConvParam {
short offsetX;
short offsetY;
short offsetZ;
ushort strideX;
ushort strideY;
};
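// Only the offset and stride members are read by the kernels in this file;
// any additional fields passed from the host side are simply ignored.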
kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
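// Weights for output slice gid.z are packed as 4 output lanes x kernelHXW x input_arr_size half4 vectors;
// lane n of this slice starts at weithTo + n * kernelHXW * input_arr_size.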
half4 output = half4(0.0);
half4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
half4 output = half4(0.0);
half4 input[9];
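// For each input slice, gather the 3x3 neighborhood around posInInput and accumulate
// one dot product per output lane (x, y, z, w) of the current output slice.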
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
half4 output = half4(0.0);
half4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
output = output + biase[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
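// Depthwise: each output slice reads the matching input slice, and every lane owns its own 3x3 scalar filter.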
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
half4 output = half4(0.0);
half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
half4 input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
outTexture.write(output, gid.xy, gid.z);
}
/*---------------------------------------------*/
kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
output = output + biase[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0);
float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
float4 input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z);
}
......@@ -14,38 +14,49 @@
import Foundation
public struct MetalConvParam {
let offsetX: Int16
let offsetY: Int16
let offsetZ: Int16
let strideX: UInt16
let strideY: UInt16
let dilationX: UInt16
let dilationY: UInt16
}
class ConvKernel<P: PrecisionType>: Kernel, Computable {
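// Plain convolution (no fused add / batch norm): picks conv_1x1, depthwise_conv_3x3 or conv_3x3
// and fills MetalConvParam with the offsets, strides and dilations.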
var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvParam<P>) {
param.filter.initBuffer(device: device, precision: ComputePrecision.Float32)
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_1x1")
} else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_3x3")
} else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_3x3")
} else {
fatalError(" unsupport ")
}
let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0])
let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1])
let offsetZ = 0.0
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
}
func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
struct MetalConvTransposeParam {
let kernelW: UInt16;
let kernelH: UInt16;
let strideX: UInt16;
let strideY: UInt16;
let paddingX: UInt16;
let paddingY: UInt16;
let dilationX: UInt16;
let dilationY: UInt16;
}
class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{
var metalParam: MetalConvTransposeParam!
required init(device: MTLDevice, param: ConvTransposeParam<P>) {
param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
param.filter.initBuffer(device: device, precision: computePrecision, convertToNHWC: false, withTranspose: true)
if computePrecision == .Float32 {
if param.stride == [2, 2] {
super.init(device: device, inFunctionName: "conv_transpose2x2_stride2")
} else {
fatalError(" -- conv transpose unsupported yet -- ")
}
} else if computePrecision == .Float16 {
if param.stride == [2, 2] {
super.init(device: device, inFunctionName: "conv_transpose2x2_stride2_half")
} else {
fatalError(" -- conv transpose unsupported yet -- ")
}
} else {
fatalError()
}
// let filter: [Float32] = param.filter.buffer.array()
// print(" conv transpose filter")
// print(filter)
let kernelWidth = UInt16(param.filter.width)
let kernelHeight = UInt16(param.filter.height)
let strideX = UInt16(param.stride[0])
let strideY = UInt16(param.stride[1])
let paddingX = UInt16(param.paddings[0])
let paddingY = UInt16(param.paddings[1])
let dilationX = UInt16(param.dilations[0])
let dilationY = UInt16(param.dilations[1])
metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY)
}
func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvTransposeParam>.size, index: 0)
encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
}
......@@ -14,13 +14,60 @@
import Foundation
struct ElementwiseAddMetalParam {
var fast: Int32 = 0
var axis: Int32 = 0
var ylen: Int32 = 0
var xdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
var xtrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
var ydim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
var ytrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
}
class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable {
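// Element-wise add of inputX and inputY; the fast path applies when both tensors share the same dim and transpose.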
var metalParam: ElementwiseAddMetalParam
required init(device: MTLDevice, param: ElementwiseAddParam<P>) {
param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
metalParam = ElementwiseAddMetalParam.init()
let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) }
let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) }
let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) }
let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) }
metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
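// Remap the add axis into the padded 4-D layout: axis == -1 aligns inputY with the trailing dimensions of inputX.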
if param.axis == -1 {
metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout())
} else {
metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
}
metalParam.ylen = Int32(param.inputY.tensorDim.cout())
if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
// print("===> elementwise_add fast!!!")
metalParam.fast = 1
}
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "elementwise_add")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "elementwise_add_half")
} else {
fatalError()
}
}
func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.inputX.metalTexture, index: 0)
encoder.setTexture(param.inputY.metalTexture, index: 1)
encoder.setTexture(param.output.metalTexture, index: 2)
encoder.setBytes(&metalParam, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class ElementwiseAddPreluKernel<P: PrecisionType>: Kernel, Computable {
var metalParam: ElementwiseAddMetalParam
required init(device: MTLDevice, param: ElementwiseAddPreluParam<P>) {
param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
param.alpha.initBuffer(device: device, precision: computePrecision)
metalParam = ElementwiseAddMetalParam.init()
let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) }
let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) }
let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) }
let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) }
metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
if param.axis == -1 {
metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout())
} else {
metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
}
metalParam.ylen = Int32(param.inputY.tensorDim.cout())
if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
// print("===> elementwise_add fast!!!")
metalParam.fast = 1
}
if computePrecision == .Float32 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "elementwise_add_channel_float")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "elementwise_add_element_float")
} else {
super.init(device: device, inFunctionName: "elementwise_add_prelu_float")
}
} else if computePrecision == .Float16 {
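// Float16 path: the channel, element and default modes all dispatch elementwise_add_channel_half here.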
if param.mode == "channel" {
super.init(device: device, inFunctionName: "elementwise_add_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "elementwise_add_channel_half")
} else {
super.init(device: device, inFunctionName: "elementwise_add_channel_half")
}
} else {
fatalError()
}
}
func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddPreluParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.inputX.metalTexture, index: 0)
encoder.setTexture(param.inputY.metalTexture, index: 1)
encoder.setTexture(param.output.metalTexture, index: 2)
encoder.setBytes(&metalParam, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0)
encoder.setBuffer(param.alpha.buffer, offset: 0, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
struct FlattenMetalParam {
var idim: (Int32, Int32, Int32, Int32)
var itrans: (Int32, Int32, Int32, Int32)
var odim: (Int32, Int32, Int32, Int32)
var otrans: (Int32, Int32, Int32, Int32)
}
class FlattenKernel<P: PrecisionType>: Kernel, Computable{
var metalParam: FlattenMetalParam
required init(device: MTLDevice, param: FlattenParam<P>) {
param.output.initTexture(device: device, computePrecision: computePrecision)
var id: [Int32] = [1, 1, 1, 1]
for i in 0..<param.input.tensorDim.cout() {
id[4-param.input.tensorDim.cout()+i] = Int32(param.input.tensorDim[i])
}
let it: [Int32] = param.input.transpose.map { Int32($0) }
var od: [Int32] = [1, 1, 1, 1]
for i in 0..<param.output.tensorDim.cout() {
od[4-param.output.tensorDim.cout()+i] = Int32(param.output.tensorDim[i])
}
let ot: [Int32] = param.output.transpose.map { Int32($0) }
metalParam = FlattenMetalParam.init(
idim: (id[0], id[1], id[2], id[3]),
itrans: (it[0], it[1], it[2], it[3]),
odim: (od[0], od[1], od[2], od[3]),
otrans: (ot[0], ot[1], ot[2], ot[3])
)
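// Flatten runs as a reshape to a rank-2 tensor, so it reuses the reshape_<irank>_2 shader functions.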
let irank = param.input.tensorDim.cout()
let orank = param.output.tensorDim.cout()
assert(orank == 2)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "reshape_\(irank)_2_float")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "reshape_\(irank)_2_half")
} else {
fatalError()
}
}
func compute(commandBuffer: MTLCommandBuffer, param: FlattenParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<FlattenMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Metal
import Foundation
public protocol TestParam {
}
public protocol Testable {
associatedtype TestParamType: TestParam
func test(commandBuffer: MTLCommandBuffer, param: TestParamType)
init(device: MTLDevice, testParam: TestParamType)
}
protocol Computable {
associatedtype ParamType: OpParam
func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws
init(device: MTLDevice, param: ParamType)
}
protocol KernelProtocol {
var pipline: MTLComputePipelineState { get set }
var functionName: String { get set }
}
open class Kernel {
let pipline: MTLComputePipelineState
let functionName: String
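/// Builds the compute pipeline state for the Metal function named inFunctionName.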
public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = true) {
pipline = device.pipeLine(funcName: inFunctionName, inPaddleMobileLib: usePaddleMobileLib)
functionName = inFunctionName
}
}
open class CusomKernel: Kernel {
public struct Shape {
public let width: Int
public let height: Int
public let channel: Int
public init(inWidth: Int, inHeight: Int, inChannel: Int){
width = inWidth
height = inHeight
channel = inChannel
}
}
let outputTexture: MTLTexture
public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, usePaddleMobileLib: Bool = false) {
let textureDesc = MTLTextureDescriptor.init()
textureDesc.textureType = .type2D
textureDesc.width = outputDim.width
textureDesc.height = outputDim.height
textureDesc.depth = (outputDim.channel + 3) / 4
textureDesc.pixelFormat = .rgba32Float
textureDesc.usage = [.shaderRead, .shaderWrite]
textureDesc.storageMode = .shared
outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error "
super.init(device: device, inFunctionName: inFunctionName, usePaddleMobileLib: usePaddleMobileLib)
}
func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(inputTexuture, index: 0)
encoder.setTexture(outputTexture, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: outputTexture)
encoder.endEncoding()
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
using namespace metal;
struct OutputDim {
ushort width;
ushort height;
ushort strideX;
ushort strideY;
};
kernel void resize(texture2d<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant OutputDim &params [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint2 pos = gid.xy * uint2(params.strideX, params.strideY);
const half4 input = inTexture.read(pos);
outTexture.write(half4(input.x, input.y, input.z, input.w), gid.xy, gid.z);
}
kernel void relu(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero);
const half4 input = inTexture.read(gid.xy, gid.z);
const float4 relu = fmax((float4)input, 0.0);
outTexture.write(half4(relu), gid.xy, gid.z);
}
kernel void elementwise_add(texture2d_array<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
const device half4 *biasTerms [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero);
const half4 input = inTexture.read(gid.xy, gid.z);
outTexture.write(input, gid.xy, gid.z);
}
kernel void batchnorm(texture2d_array<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
const device half4 * newScale [[buffer(0)]],
const device half4 * newBias [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
const half4 input = inTexture.read(gid.xy, gid.z);
half4 output = input * newScale[gid.z] + newBias[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
//kernel void texture2d_to_2d_array(texture2d<half, access::read> inTexture [[texture(0)]],
// texture2d_array<half, access::write> outTexture [[texture(1)]],
// uint3 gid [[thread_position_in_grid]]) {
// if (gid.x >= inTexture.get_width() ||
// gid.y >= inTexture.get_height()){
// return;
// }
// const half4 input = inTexture.read(gid.xy);
// outTexture.write(input, gid.xy, 0);
//}
kernel void texture2d_to_2d_array(texture2d<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= inTexture.get_width() ||
gid.y >= inTexture.get_height()){
return;
}
const float4 input = inTexture.read(gid.xy);
outTexture.write(input, gid.xy, 0);
}
kernel void texture2d_to_2d_array_half(texture2d<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= inTexture.get_width() ||
gid.y >= inTexture.get_height()){
return;
}
const half4 input = inTexture.read(gid.xy);
outTexture.write(input, gid.xy, 0);
}
struct PoolParam {
int ksizeX;
int ksizeY;
int strideX;
int strideY;
int paddingX;
int paddingY;
int poolType;
};
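// poolType 0 selects max pooling, poolType 1 selects average pooling.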
kernel void pool(texture2d_array<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant PoolParam &pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
int xmin = gid.x * pm.strideX - pm.paddingX;
int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
xmin = max(xmin, 0);
    int ymin = gid.y * pm.strideY - pm.paddingY;
    int ymax = min(ymin + pm.ksizeY, int(inTexture.get_height()));
ymin = max(ymin, 0);
float4 r = 0;
if (pm.poolType == 0) {
r = inTexture.read(uint2(xmin, ymin), gid.z);
for (int x = xmin; x < xmax; x++) {
for (int y = ymin; y < ymax; y++) {
r = fmax(r, inTexture.read(uint2(x, y), gid.z));
}
}
} else if (pm.poolType == 1) {
for (int x = xmin; x < xmax; x++) {
for (int y = ymin; y < ymax; y++) {
r += inTexture.read(uint2(x, y), gid.z);
}
}
r /= pm.ksizeX * pm.ksizeY;
}
outTexture.write(r, gid.xy, gid.z);
}
kernel void pool_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant PoolParam &pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
int xmin = gid.x * pm.strideX - pm.paddingX;
int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
xmin = max(xmin, 0);
    int ymin = gid.y * pm.strideY - pm.paddingY;
    int ymax = min(ymin + pm.ksizeY, int(inTexture.get_height()));
ymin = max(ymin, 0);
half4 r = 0;
if (pm.poolType == 0) {
r = inTexture.read(uint2(xmin, ymin), gid.z);
for (int x = xmin; x < xmax; x++) {
for (int y = ymin; y < ymax; y++) {
r = fmax(r, inTexture.read(uint2(x, y), gid.z));
}
}
} else if (pm.poolType == 1) {
for (int x = xmin; x < xmax; x++) {
for (int y = ymin; y < ymax; y++) {
r += inTexture.read(uint2(x, y), gid.z);
}
}
r /= pm.ksizeX * pm.ksizeY;
}
outTexture.write(r, gid.xy, gid.z);
}
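A CPU reference for the window arithmetic in pool / pool_half, useful when checking GPU output against expected values (a sketch, not part of the library; the average branch divides by the full kernel area, as the shaders do):

import Foundation

// Reference 2-D pooling over one channel stored as [row][column].
// poolType 0 = max, 1 = average, matching the Metal kernels above.
func poolReference(input: [[Float]], ksize: (x: Int, y: Int),
                   stride: (x: Int, y: Int), padding: (x: Int, y: Int),
                   poolType: Int, outW: Int, outH: Int) -> [[Float]] {
    let inH = input.count, inW = input[0].count
    var out = [[Float]](repeating: [Float](repeating: 0, count: outW), count: outH)
    for oy in 0..<outH {
        for ox in 0..<outW {
            var xmin = ox * stride.x - padding.x
            let xmax = min(xmin + ksize.x, inW)
            xmin = max(xmin, 0)
            var ymin = oy * stride.y - padding.y
            let ymax = min(ymin + ksize.y, inH)
            ymin = max(ymin, 0)
            var r: Float = poolType == 0 ? -Float.greatestFiniteMagnitude : 0
            for y in ymin..<ymax {
                for x in xmin..<xmax {
                    r = poolType == 0 ? max(r, input[y][x]) : r + input[y][x]
                }
            }
            if poolType == 1 { r /= Float(ksize.x * ksize.y) }
            out[oy][ox] = r
        }
    }
    return out
}

let img: [[Float]] = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]
print(poolReference(input: img, ksize: (x: 2, y: 2), stride: (x: 2, y: 2),
                    padding: (x: 0, y: 0), poolType: 0, outW: 2, outH: 2))
// [[6.0, 8.0], [14.0, 16.0]]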
kernel void reshape(texture2d_array<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
float4 r = inTexture.read(uint2(0, 0), gid.z);
outTexture.write(r, gid.xy, gid.z);
}
kernel void reshape_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
half4 r = inTexture.read(uint2(0, 0), gid.z);
outTexture.write(r, gid.xy, gid.z);
}
kernel void softmax(texture2d_array<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
int zsize = inTexture.get_array_size();
float maxv = inTexture.read(uint2(0, 0), 0)[0];
for (int z = 0; z < zsize; z++) {
float4 r = inTexture.read(uint2(0, 0), z);
maxv = max(maxv, max(max(r[0], r[1]), max(r[2], r[3])));
}
float sum = 0;
for (int z = 0; z < zsize; z++) {
float4 r = inTexture.read(uint2(0, 0), z);
sum += exp(r[0] - maxv) + exp(r[1] - maxv) + exp(r[2] - maxv) + exp(r[3] - maxv);
}
float4 rr = inTexture.read(gid.xy, gid.z);
rr = exp(rr - maxv) / sum;
outTexture.write(rr, gid.xy, gid.z);
}
kernel void softmax_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
int zsize = inTexture.get_array_size();
half maxv = inTexture.read(uint2(0, 0), 0)[0];
for (int z = 0; z < zsize; z++) {
half4 r = inTexture.read(uint2(0, 0), z);
maxv = max(maxv, max(max(r[0], r[1]), max(r[2], r[3])));
}
float sum = 0;
for (int z = 0; z < zsize; z++) {
half4 r = inTexture.read(uint2(0, 0), z);
sum += exp(r[0] - maxv) + exp(r[1] - maxv) + exp(r[2] - maxv) + exp(r[3] - maxv);
}
half4 rr = inTexture.read(gid.xy, gid.z);
rr = exp(rr - maxv) / sum;
outTexture.write(rr, gid.xy, gid.z);
}
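Both softmax kernels subtract the global maximum before exponentiating so the exponentials stay within half-precision range. The same numerically stable computation over a plain vector (reference sketch, not part of the library):

import Foundation

// Numerically stable softmax: shift by the maximum, exponentiate, normalise.
func softmax(_ logits: [Double]) -> [Double] {
    let maxv = logits.max() ?? 0
    let exps = logits.map { exp($0 - maxv) }
    let sum = exps.reduce(0, +)
    return exps.map { $0 / sum }
}

print(softmax([1, 2, 3]))   // ≈ [0.090, 0.245, 0.665]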
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class MulticlassNMSKernel<P: PrecisionType>: Kernel, Computable{
let pipline1: MTLComputePipelineState
required init(device: MTLDevice, param: MulticlassNMSParam<P>) {
param.middleOutput.initBuffer(device: device)
param.bboxOutput.initBuffer(device: device)
if computePrecision == .Float32 {
pipline1 = device.pipeLine(funcName: "nms_fetch_bbox", inPaddleMobileLib: true)
super.init(device: device, inFunctionName: "nms_fetch_result")
} else if computePrecision == .Float16 {
pipline1 = device.pipeLine(funcName: "nms_fetch_bbox_half", inPaddleMobileLib: true)
super.init(device: device, inFunctionName: "nms_fetch_result_half")
} else {
fatalError( " unsupport precision " )
}
}
func compute(commandBuffer: MTLCommandBuffer, param: MulticlassNMSParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.scores.metalTexture, index: 0)
encoder.setBuffer(param.middleOutput.resultBuffer!, offset: 0, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.scores.metalTexture)
encoder.endEncoding()
guard let encoderBox = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoderBox.setTexture(param.bboxes.metalTexture, index: 0)
encoderBox.setBuffer(param.bboxOutput.resultBuffer!, offset: 0, index: 0)
encoderBox.dispatch(computePipline: pipline1, outTexture: param.bboxes.metalTexture)
encoderBox.endEncoding()
}
}
......@@ -15,46 +15,57 @@
import Foundation
struct PoolMetalParam {
let ksizeX: Int32
let ksizeY: Int32
let strideX: Int32
let strideY: Int32
let paddingX: Int32
let paddingY: Int32
let poolType: Int32
let ksizeX: Int32
let ksizeY: Int32
let strideX: Int32
let strideY: Int32
let paddingX: Int32
let paddingY: Int32
let poolType: Int32
}
class PoolKernel<P: PrecisionType>: Kernel, Computable{
func compute(commandBuffer: MTLCommandBuffer, param: PoolParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
var poolType: Int32
switch param.poolType {
case "max":
poolType = 0
case "avg":
poolType = 1
default:
throw PaddleMobileError.predictError(message: " unknown pooltype " + param.poolType)
}
var pmp = PoolMetalParam.init(
ksizeX: param.ksize[0],
ksizeY: param.ksize[1],
strideX: param.stride[0],
strideY: param.stride[1],
paddingX: param.padding[0],
paddingY: param.padding[1],
poolType: poolType
)
encoder.setBytes(&pmp, length: MemoryLayout<PoolMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
var metalParam: PoolMetalParam
required init(device: MTLDevice, param: PoolParam<P>) {
param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
var poolType: Int32
switch param.poolType {
case "max":
poolType = 0
case "avg":
poolType = 1
default:
fatalError()
}
metalParam = PoolMetalParam.init(
ksizeX: param.ksize[0],
ksizeY: param.ksize[1],
strideX: param.stride[0],
strideY: param.stride[1],
paddingX: param.padding[0],
paddingY: param.padding[1],
poolType: poolType
)
required init(device: MTLDevice, param: PoolParam<P>) {
super.init(device: device, inFunctionName: "pool")
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "pool")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "pool_half")
} else {
fatalError()
}
}
func compute(commandBuffer: MTLCommandBuffer, param: PoolParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<PoolMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class PreluKernel<P: PrecisionType>: Kernel, Computable{
required init(device: MTLDevice, param: PreluParam<P>) {
param.alpha.initBuffer(device: device, precision: computePrecision)
param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
if computePrecision == .Float32 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "prelu_channel")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "prelu_element")
} else {
super.init(device: device, inFunctionName: "prelu_other")
}
} else if computePrecision == .Float16 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "prelu_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "prelu_element_half")
} else {
super.init(device: device, inFunctionName: "prelu_other_half")
}
} else {
fatalError()
}
}
func compute(commandBuffer: MTLCommandBuffer, param: PreluParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBuffer(param.alpha.buffer, offset: 0, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
struct PriorBoxMetalParam {
let offset: Float32
let stepWidth: Float32
let stepHeight: Float32
let minSize: Float32
let maxSize: Float32
let imageWidth: Float32
let imageHeight: Float32
let clip: Bool
let numPriors: uint
let aspecRatiosSize: uint
let minSizeSize: uint
let maxSizeSize: uint
}
class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{
var metalParam: PriorBoxMetalParam!
required init(device: MTLDevice, param: PriorBoxParam<P>) {
let originDim = param.output.tensorDim;
param.output.tensorDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]])
param.output.padToFourDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]])
param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: computePrecision)
param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
if computePrecision == .Float32 {
if param.min_max_aspect_ratios_order {
super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder")
} else {
super.init(device: device, inFunctionName: "prior_box")
}
} else if computePrecision == .Float16 {
if param.min_max_aspect_ratios_order {
super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder_half")
} else {
super.init(device: device, inFunctionName: "prior_box_half")
}
} else {
fatalError()
}
guard param.minSizes.count == 1 else {
fatalError(" need implement ")
}
// let n = 1
// let h = param.output.dim[1]
// let w = param.output.dim[2]
// let c = param.output.dim[3] * param.output.dim[0]
//
// param.output.dim = Dim.init(inDim: [n, h, w, c])
// param.output.transpose = [0, 1, 2, 3]
let imageWidth = Float32(param.inputImage.padToFourDim[3])
let imageHeight = Float32(param.inputImage.padToFourDim[2])
let featureWidth = param.input.padToFourDim[3]
let featureHeight = param.input.padToFourDim[2]
if param.stepW == 0 || param.stepH == 0 {
param.stepW = Float32(imageWidth) / Float32(featureWidth)
param.stepH = Float32(imageHeight) / Float32(featureHeight)
}
var outputAspectRatior: [Float32] = []
outputAspectRatior.append(1.0)
let epsilon = 1e-6
for ar in param.aspectRatios {
var alreadyExist = false
for outputAr in outputAspectRatior {
if fabs(Double(ar) - Double(outputAr)) < Double(epsilon) {
alreadyExist = true
break
}
}
if !alreadyExist {
outputAspectRatior.append(ar)
if param.flip {
outputAspectRatior.append(1.0 / ar)
}
}
}
if computePrecision == .Float16 {
let buffer = device.makeBuffer(length: outputAspectRatior.count * MemoryLayout<Float16>.size)
float32ToFloat16(input: &outputAspectRatior, output:(buffer?.contents())!, count: outputAspectRatior.count)
param.newAspectRatios = buffer
} else if computePrecision == .Float32 {
let buffer = device.makeBuffer(bytes: outputAspectRatior, length: outputAspectRatior.count * MemoryLayout<Float32>.size, options: [])
param.newAspectRatios = buffer
} else {
fatalError()
}
let aspectRatiosSize = uint(outputAspectRatior.count)
let maxSizeSize: uint = uint(param.maxSizes.count)
let minSizeSize: uint = uint(param.minSizes.count)
let numPriors = aspectRatiosSize * minSizeSize + maxSizeSize
let minSize = param.minSizes.last ?? 0.0
let maxSize = param.maxSizes.last ?? 0.0
metalParam = PriorBoxMetalParam.init(offset: param.offset, stepWidth: param.stepW, stepHeight: param.stepH, minSize: minSize, maxSize: maxSize, imageWidth: imageWidth, imageHeight: imageHeight, clip: param.clip, numPriors: numPriors, aspecRatiosSize: aspectRatiosSize, minSizeSize: minSizeSize, maxSizeSize: maxSizeSize)
}
func compute(commandBuffer: MTLCommandBuffer, param: PriorBoxParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setTexture(param.outputVariances.metalTexture, index: 2)
encoder.setBuffer(param.newAspectRatios!, offset: 0, index: 0)
encoder.setBytes(&metalParam, length: MemoryLayout<PriorBoxMetalParam>.size, index: 1)
encoder.setBytes(param.variances, length: MemoryLayout<Float32>.size * param.variances.count, index: 2)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
}
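The init above expands the configured aspect ratios (keeping 1.0, skipping near-duplicates, adding reciprocals when flip is set) and then derives the prior count per feature-map cell as aspectRatios * minSizes + maxSizes. The same bookkeeping on the CPU (a sketch with illustrative values, not the repo's API):

import Foundation

// Expand aspect ratios the way SSD-style prior boxes do: keep 1.0, skip
// near-duplicates, and add the reciprocal of each new ratio when flip is true.
func expandAspectRatios(_ ratios: [Double], flip: Bool, epsilon: Double = 1e-6) -> [Double] {
    var out: [Double] = [1.0]
    for ar in ratios {
        if out.contains(where: { abs($0 - ar) < epsilon }) { continue }
        out.append(ar)
        if flip { out.append(1.0 / ar) }
    }
    return out
}

let ratios = expandAspectRatios([1.0, 2.0, 3.0], flip: true)   // [1, 2, 0.5, 3, 0.333...]
let numPriors = ratios.count * 1 + 1                           // one min size, one max size
print(ratios, numPriors)                                       // 6 priors per cell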
......@@ -15,17 +15,23 @@
import Foundation
class ReluKernel<P: PrecisionType>: Kernel, Computable{
func compute(commandBuffer: MTLCommandBuffer, param: ReluParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
func compute(commandBuffer: MTLCommandBuffer, param: ReluParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
required init(device: MTLDevice, param: ReluParam<P>) {
super.init(device: device, inFunctionName: "relu")
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
required init(device: MTLDevice, param: ReluParam<P>) {
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "relu")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "relu_half")
} else {
fatalError()
}
}
}
......@@ -14,18 +14,84 @@
import Foundation
struct ReshapeMetalParam {
var idim: (Int32, Int32, Int32, Int32)
var itrans: (Int32, Int32, Int32, Int32)
var odim: (Int32, Int32, Int32, Int32)
var otrans: (Int32, Int32, Int32, Int32)
}
struct ReshapeTestParam: TestParam {
let inputTexture: MTLTexture
let outputTexture: MTLTexture
let param: ReshapeMetalParam
}
class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
required init(device: MTLDevice, param: ReshapeParam<P>) {
super.init(device: device, inFunctionName: "reshape")
var metalParam: ReshapeMetalParam
required init(device: MTLDevice, param: ReshapeParam<P>) {
param.output.initTexture(device: device, computePrecision: computePrecision)
var id: [Int32] = [1, 1, 1, 1]
for i in 0..<param.input.tensorDim.cout() {
id[4-param.input.tensorDim.cout()+i] = Int32(param.input.tensorDim[i])
}
func compute(commandBuffer: MTLCommandBuffer, param: ReshapeParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
let it: [Int32] = param.input.transpose.map { Int32($0) }
var od: [Int32] = [1, 1, 1, 1]
for i in 0..<param.output.tensorDim.cout() {
od[4-param.output.tensorDim.cout()+i] = Int32(param.output.tensorDim[i])
}
let ot: [Int32] = param.output.transpose.map { Int32($0) }
metalParam = ReshapeMetalParam.init(
idim: (id[0], id[1], id[2], id[3]),
itrans: (it[0], it[1], it[2], it[3]),
odim: (od[0], od[1], od[2], od[3]),
otrans: (ot[0], ot[1], ot[2], ot[3])
)
let irank = param.input.tensorDim.cout()
let orank = param.output.tensorDim.cout()
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_float")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_half")
} else {
fatalError()
}
}
required init(device: MTLDevice, testParam: ReshapeTestParam) {
metalParam = ReshapeMetalParam.init(
idim: (0, 0, 0, 0),
itrans: (0, 0, 0, 0),
odim: (0, 0, 0, 0),
otrans: (0, 0, 0, 0)
)
super.init(device: device, inFunctionName: "reshape")
}
func compute(commandBuffer: MTLCommandBuffer, param: ReshapeParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
// func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) {
// guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
// fatalError()
// }
// encoder.setTexture(testParam.inputTexture, index: 0)
// encoder.setTexture(testParam.outputTexture, index: 1)
// var pm: ReshapeMetalParam = testParam.param
// encoder.setBytes(&pm, length: MemoryLayout<ReshapeMetalParam>.size, index: 0)
// encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture)
// encoder.endEncoding()
// }
}
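ReshapeKernel right-aligns every tensor shape into a fixed rank-4 form before handing it to the shader, so a rank-2 [N, C] input becomes (1, 1, N, C). That padding in isolation (sketch, illustrative shapes):

import Foundation

// Right-align a tensor shape into rank-4 form, as the kernel parameters expect.
func padTo4(_ dims: [Int]) -> [Int32] {
    var padded: [Int32] = [1, 1, 1, 1]
    for (i, d) in dims.enumerated() {
        padded[4 - dims.count + i] = Int32(d)
    }
    return padded
}

print(padTo4([10, 576]))        // [1, 1, 10, 576]
print(padTo4([1, 24, 19, 19]))  // [1, 24, 19, 19]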
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
struct ResizeBilinearMetalParam {
var ratio_h: Float32
var ratio_w: Float32
}
class ResizeBilinearKernel<P: PrecisionType>: Kernel, Computable{
func compute(commandBuffer: MTLCommandBuffer, param: ResizeBilinearParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
let ratio_h: Float32 = Float32(param.input.tensorDim.dims[2]) / Float32(param.output.tensorDim.dims[2])
let ratio_w: Float32 = Float32(param.input.tensorDim.dims[3]) / Float32(param.output.tensorDim.dims[3])
var p = ResizeBilinearMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w)
encoder.setBytes(&p, length: MemoryLayout<ResizeBilinearMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
required init(device: MTLDevice, param: ResizeBilinearParam<P>) {
param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "resize_bilinear")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "resize_bilinear_half")
} else {
fatalError()
}
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
//
//import Foundation
//import MetalPerformanceShaders
//
//
//struct ResizeParam: OpParam{
// typealias OutputType = <#type#>
//
// typealias ParamPrecisionType = <#type#>
//
// let input: MTLTexture
// let output: MTLTexture
// let expectDim: Dim
//}
//
//struct OutputDim {
// let width: UInt16
// let height: UInt16
// let strideX: UInt16
// let strideY: UInt16
//}
//
//class ResizeKernel<P: PrecisionType>: Kernel, Computable{
// var lanczos: MPSImageLanczosScale
// required init(device: MTLDevice, param: ResizeParam) {
// lanczos = MPSImageLanczosScale.init(device: device)
// super.init(device: device, inFunctionName: "resize")
// }
// func compute(commandBuffer: MTLCommandBuffer, param: ResizeParam) throws {
//// guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
//// throw PaddleMobileError.predictError(message: " encode is nil")
//// }
// lanczos.encode(commandBuffer: commandBuffer, sourceTexture: param.input, destinationTexture: param.output)
//
//// encoder.setTexture(param.input, index: 0)
//// encoder.setTexture(param.output, index: 1)
//// let strideX = param.input.width/param.expectDim[2]
//// let strideY = param.input.height/param.expectDim[1]
//// var outputDim = OutputDim.init(width: UInt16(param.expectDim[1]), height: UInt16(param.expectDim[2]), strideX: UInt16(strideX), strideY: UInt16(strideY))
//// encoder.setBytes(&outputDim, length: MemoryLayout<OutputDim>.size, index: 0)
//// encoder.dispatch(computePipline: pipline, outTexture: param.output)
//// encoder.endEncoding()
// }
//
//
//
//
//}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
struct ShapeMetalParam {
}
class ShapeKernel<P: PrecisionType>: Kernel, Computable{
func compute(commandBuffer: MTLCommandBuffer, param: ShapeParam<P>) throws {
// print("shape compute")
// guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
// throw PaddleMobileError.predictError(message: " encode is nil")
// }
// encoder.setTexture(param.output.metalTexture, index: 0)
// encoder.endEncoding()
}
required init(device: MTLDevice, param: ShapeParam<P>) {
param.output.initTexture(device: device, computePrecision: computePrecision)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "shape")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "shape_half")
} else {
fatalError()
}
}
}
......@@ -14,19 +14,38 @@
import Foundation
struct SoftmaxMetalParam {
let N: Int32
let K: Int32
}
class SoftmaxKernel<P: PrecisionType>: Kernel, Computable{
func compute(commandBuffer: MTLCommandBuffer, param: SoftmaxParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
var metalParam: SoftmaxMetalParam
required init(device: MTLDevice, param: SoftmaxParam<P>) {
param.output.initTexture(device: device, computePrecision: computePrecision)
metalParam = SoftmaxMetalParam.init(
N: Int32(param.input.tensorDim[0]),
K: Int32(param.input.tensorDim[1])
)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "softmax_float")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "softmax_half")
} else {
fatalError()
}
required init(device: MTLDevice, param: SoftmaxParam<P>) {
super.init(device: device, inFunctionName: "softmax")
}
func compute(commandBuffer: MTLCommandBuffer, param: SoftmaxParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<SoftmaxMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
struct SplitMetalParam {
var idim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1)
var axis: Int32 = 0
var offset: Int32 = 0
var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
var vdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
}
class SplitKernel<P: PrecisionType>: Kernel, Computable{
var smp: SplitMetalParam
func compute(commandBuffer: MTLCommandBuffer, param: SplitParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
for i in 0..<param.outputList.count {
encoder.setTexture(param.outputList[i].metalTexture, index: i + 1)
}
encoder.setBytes(&smp, length: MemoryLayout<SplitMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture)
encoder.endEncoding()
}
required init(device: MTLDevice, param: SplitParam<P>) {
// param.output.initTexture(device: device, computePrecision: computePrecision)
let num = param.outputList.count
let rank = param.input.tensorDim.cout()
assert(num >= 2 && num <= 4)
for output in param.outputList {
output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
}
smp = SplitMetalParam.init()
smp.idim = (Int32(param.input.dim[0]), Int32(param.input.dim[1]), Int32(param.input.dim[2]), Int32(param.input.dim[3]))
smp.axis = Int32(param.axis + param.input.dim.cout() - param.input.tensorDim.cout())
for i in 0..<4 {
if param.input.transpose[i] == smp.axis {
smp.axis = Int32(i)
break
}
}
smp.trans = (Int32(param.input.transpose[0]), Int32(param.input.transpose[1]), Int32(param.input.transpose[2]), Int32(param.input.transpose[3]))
var vdim: [Int32] = [0, 0, 0, 0]
for i in 0..<num {
vdim[i] = Int32(param.outputList[i].tensorDim[param.axis])
}
smp.vdim = (vdim[0], vdim[1], vdim[2], vdim[3])
var v = "normal"
if rank == 4 {
if smp.axis == 1 {
v = "y"
} else if smp.axis == 2 {
v = "x"
}
} else if rank == 3 {
if smp.axis == 2 {
v = "y"
} else if smp.axis == 3 {
v = "x"
}
} else if rank == 2 {
if smp.axis == 2 {
v = "y"
}
}
if v == "normal" {
fatalError("split unsupported")
}
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_float")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_half")
} else {
fatalError()
}
}
}
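The split (and concat) shaders walk the vdim list, subtracting each output's extent along the split axis until the coordinate falls inside one of the outputs. The same lookup on the CPU (a sketch, names are illustrative):

import Foundation

// Given a coordinate along the split axis and the per-output extents,
// return (which output, local coordinate inside it), or nil if out of range.
func locate(axisCoord: Int, vdim: [Int]) -> (output: Int, local: Int)? {
    var c = axisCoord
    for (i, extent) in vdim.enumerated() {
        if c < extent { return (i, c) }
        c -= extent
    }
    return nil
}

// Splitting a 100-wide axis into extents [30, 30, 40]:
print(locate(axisCoord: 45, vdim: [30, 30, 40])!)   // (output: 1, local: 15)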
......@@ -15,23 +15,31 @@
import Foundation
struct Texture2DTo2DArrayParam {
let input: MTLTexture
let output: MTLTexture
let expectDim: Dim
let input: MTLTexture
let output: MTLTexture
let expectDim: Dim
}
class Texture2DTo2DArrayKernel<P: PrecisionType>: Kernel, Computable{
func compute(commandBuffer: MTLCommandBuffer, param: FeedParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.mtlTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: param.input.mtlTexture)
encoder.endEncoding()
func compute(commandBuffer: MTLCommandBuffer, param: FeedParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
required init(device: MTLDevice, param: FeedParam<P>) {
super.init(device: device, inFunctionName: "texture2d_to_2d_array")
encoder.setTexture(param.input.mtlTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: param.input.mtlTexture)
encoder.endEncoding()
}
required init(device: MTLDevice, param: FeedParam<P>) {
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "texture2d_to_2d_array_half")
} else if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "texture2d_to_2d_array")
} else {
fatalError()
}
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
struct TransposeMetalParam {
var iC: Int32 = 0
var oC: Int32 = 0
var axis: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3)
}
class TransposeKernel<P: PrecisionType>: Kernel, Computable {
var metalParam: TransposeMetalParam = TransposeMetalParam.init()
required init(device: MTLDevice, param: TransposeParam<P>) {
param.output.initTexture(device: device, computePrecision: computePrecision)
let rank = param.input.tensorDim.cout()
var axis: [Int] = [0, 1, 2, 3]
for i in 0..<param.axis.count {
axis[4-rank+i] = 4 - rank + Int(param.axis[i])
}
var naxis: [Int] = [0, 0, 0, 0]
for i in 0..<4 {
for j in 0..<4 {
if param.input.transpose[j] == axis[i] {
naxis[i] = j
break
}
}
}
metalParam.iC = Int32(param.input.dim[param.input.transpose[3]])
metalParam.oC = Int32(param.output.dim[3])
metalParam.axis = (Int32(naxis[0]), Int32(naxis[1]), Int32(naxis[2]), Int32(naxis[3]))
var kernelFunc = "transpose_undefined"
if computePrecision == .Float16 {
if param.input.transpose == axis {
kernelFunc = "transpose_copy_half"
} else {
kernelFunc = "transpose_\(rank)_half"
}
} else if computePrecision == .Float32 {
if param.input.transpose == axis {
kernelFunc = "transpose_copy_float"
} else {
kernelFunc = "transpose_\(rank)_float"
}
} else {
fatalError()
}
print("===========>", kernelFunc)
print(metalParam)
super.init(device: device, inFunctionName: kernelFunc)
}
func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<TransposeMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
}
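Because a tensor may already be stored in a transposed layout, the requested permutation has to be composed with the input's current transpose, which is what the naxis loop above computes. A CPU sketch of that composition with a worked example (illustrative values, not from the repo):

import Foundation

// Compose a requested axis permutation with the layout the data is already stored in:
// result[i] answers "which slot of the stored layout holds the axis wanted i-th".
func composeTranspose(requested axis: [Int], storedLayout: [Int]) -> [Int] {
    return axis.map { a in storedLayout.firstIndex(of: a)! }
}

// Data logically NCHW but stored NHWC (storedLayout [0, 2, 3, 1]); the op asks
// for the identity permutation [0, 1, 2, 3]:
print(composeTranspose(requested: [0, 1, 2, 3], storedLayout: [0, 2, 3, 1]))
// [0, 3, 1, 2]: read slot 3 of the stored layout to obtain logical axis 1, and so on.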
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
using namespace metal;
kernel void batchnorm(texture2d_array<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
const device float4 * nscale [[buffer(0)]],
const device float4 * nbias [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
const float4 input = inTexture.read(gid.xy, gid.z);
float4 output = input * nscale[gid.z] + nbias[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
kernel void batchnorm_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
const device half4 * newScale [[buffer(0)]],
const device half4 * newBias [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
const half4 input = inTexture.read(gid.xy, gid.z);
half4 output = input * newScale[gid.z] + newBias[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
//
// BatchNormRelu.metal
// paddle-mobile
//
#include <metal_stdlib>
using namespace metal;
struct MetalConvParam {
short offsetX;
short offsetY;
short offsetZ;
ushort strideX;
ushort strideY;
};
kernel void batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
const device float4 *new_scale [[buffer(0)]],
const device float4 *new_biase [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
float4 input;
float4 output;
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
input = inTexture.sample(sample, gid.x, gid.y, gid.z);
output = fmax(input * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef P
#define CONCAT2(a, b) a ## b
#define CONCAT2_(a, b) a ## _ ## b
#define FUNC(f, p) CONCAT2_(f, p)
#define VECTOR(p, n) CONCAT2(p, n)
kernel void FUNC(bilinear_interp, P)(texture2d_array<P, access::read> input [[texture(0)]],
texture2d_array<P, access::write> output [[texture(1)]],
constant bilinear_interp_param & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
VECTOR(P, 4) r;
if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
r = input.read(gid.xy, gid.z);
} else {
P w = gid.x * pm.ratio_w;
P h = gid.y * pm.ratio_h;
uint w0 = w, h0 = h;
uint w1 = w0 + 1, h1 = h0 + 1;
P w1lambda = w - w0, h1lambda = h - h0;
P w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
if (w1 >= input.get_width()) w1 = w0;
if (h1 >= input.get_height()) h1 = h0;
VECTOR(P, 4) r0 = input.read(uint2(w0, h0), gid.z);
VECTOR(P, 4) r1 = input.read(uint2(w1, h0), gid.z);
VECTOR(P, 4) r2 = input.read(uint2(w0, h1), gid.z);
VECTOR(P, 4) r3 = input.read(uint2(w1, h1), gid.z);
r = h2lambda * (w2lambda * r0 + w1lambda * r1)
+ h1lambda * (w2lambda * r2 + w1lambda * r3);
}
output.write(r, gid.xy, gid.z);
}
#endif
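A scalar CPU reference of the same bilinear interpolation, handy for validating the shader against known values (sketch, single channel only, not part of the library):

import Foundation

// Bilinear resize of a single-channel image stored as [row][column].
func bilinearResize(_ src: [[Float]], outH: Int, outW: Int) -> [[Float]] {
    let inH = src.count, inW = src[0].count
    let ratioH = Float(inH) / Float(outH)
    let ratioW = Float(inW) / Float(outW)
    var dst = [[Float]](repeating: [Float](repeating: 0, count: outW), count: outH)
    for y in 0..<outH {
        for x in 0..<outW {
            let fw = Float(x) * ratioW, fh = Float(y) * ratioH
            let w0 = Int(fw), h0 = Int(fh)
            let w1 = min(w0 + 1, inW - 1), h1 = min(h0 + 1, inH - 1)
            let wl = fw - Float(w0), hl = fh - Float(h0)   // interpolation weights
            let top = (1 - wl) * src[h0][w0] + wl * src[h0][w1]
            let bottom = (1 - wl) * src[h1][w0] + wl * src[h1][w1]
            dst[y][x] = (1 - hl) * top + hl * bottom
        }
    }
    return dst
}

print(bilinearResize([[0, 1], [2, 3]], outH: 4, outW: 4))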
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
using namespace metal;
struct bilinear_interp_param {
float ratio_h;
float ratio_w;
};
#define P float
#include "BilinearInterp.inc.metal"
#undef P
#define P half
#include "BilinearInterp.inc.metal"
#undef P
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef P
#define CONCAT2(a, b) a ## b
#define CONCAT2_(a, b) a ## _ ## b
#define FUNC(f, p) CONCAT2_(f, p)
#define VECTOR(p, n) CONCAT2(p, n)
kernel void FUNC(boxcoder, P)(texture2d_array<P, access::read> priorBox [[texture(0)]],
texture2d_array<P, access::read> priorBoxVar [[texture(1)]],
texture2d_array<P, access::read> targetBox [[texture(2)]],
texture2d_array<P, access::write> output[[texture(3)]],
uint3 gid [[thread_position_in_grid]]) {
VECTOR(P, 4) p = priorBox.read(uint2(0, gid.x), gid.z);
VECTOR(P, 4) pv = priorBoxVar.read(uint2(0, gid.x), gid.z);
VECTOR(P, 4) t;
t[0] = targetBox.read(uint2(0, gid.x), gid.z)[0];
t[1] = targetBox.read(uint2(1, gid.x), gid.z)[0];
t[2] = targetBox.read(uint2(2, gid.x), gid.z)[0];
t[3] = targetBox.read(uint2(3, gid.x), gid.z)[0];
P px = (p.x + p.z) / 2;
P py = (p.y + p.w) / 2;
P pw = p.z - p.x;
P ph = p.w - p.y;
P tx = pv.x * t.x * pw + px;
P ty = pv.y * t.y * ph + py;
P tw = exp(pv.z * t.z) * pw;
P th = exp(pv.w * t.w) * ph;
VECTOR(P, 4) r;
r.x = tx - tw / 2;
r.y = ty - th / 2;
r.z = tx + tw / 2;
r.w = ty + th / 2;
output.write(r, gid.xy, gid.z);
}
#endif
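The decode above converts the prior box to center-size form, applies the predicted offsets scaled by the per-coordinate variances, and converts back to corners. The same math on plain values (a sketch, not the repo's API):

import Foundation

// Decode one SSD box. prior and the result are (xmin, ymin, xmax, ymax);
// target holds the predicted (dx, dy, dw, dh); variance scales each component.
func decodeBox(prior p: [Double], variance v: [Double], target t: [Double]) -> [Double] {
    let px = (p[0] + p[2]) / 2, py = (p[1] + p[3]) / 2   // prior center
    let pw = p[2] - p[0], ph = p[3] - p[1]               // prior size
    let cx = v[0] * t[0] * pw + px                       // decoded center
    let cy = v[1] * t[1] * ph + py
    let w = exp(v[2] * t[2]) * pw                        // decoded size
    let h = exp(v[3] * t[3]) * ph
    return [cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2]
}

// Zero offsets decode back to the prior box itself:
print(decodeBox(prior: [0.1, 0.1, 0.3, 0.3], variance: [0.1, 0.1, 0.2, 0.2], target: [0, 0, 0, 0]))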
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
using namespace metal;
#define P float
#include "BoxCoder.inc.metal"
#undef P
#define P half
#include "BoxCoder.inc.metal"
#undef P
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
using namespace metal;
inline void xyzn2abcd_1(int xyzn[4], int abcd[4]) {
abcd[0] = abcd[1] = abcd[2] = 0;
abcd[3] = xyzn[0] * 4 + xyzn[3];
}
inline void xyzn2abcd_2(int xyzn[4], int abcd[4]) {
abcd[0] = abcd[1] = 0;
abcd[2] = xyzn[1];
abcd[3] = xyzn[0] * 4 + xyzn[3];
}
inline void xyzn2abcd_3(int xyzn[4], int abcd[4]) {
abcd[0] = 0;
abcd[3] = xyzn[0];
abcd[2] = xyzn[1];
abcd[1] = xyzn[2] * 4 + xyzn[3];
}
inline void xyzn2abcd_4(int C, int xyzn[4], int abcd[4]) {
abcd[2] = xyzn[0];
abcd[1] = xyzn[1];
uint t = xyzn[2] * 4 + xyzn[3];
abcd[0] = t / C;
abcd[3] = t % C;
}
inline void abcd2xyzn_1(int abcd[4], int xyzn[4]) {
xyzn[1] = xyzn[2] = 0;
xyzn[0] = abcd[3] / 4;
xyzn[3] = abcd[3] % 4;
}
inline void abcd2xyzn_2(int abcd[4], int xyzn[4]) {
xyzn[2] = 0;
xyzn[1] = abcd[2];
xyzn[0] = abcd[3] / 4;
xyzn[3] = abcd[3] % 4;
}
inline void abcd2xyzn_3(int abcd[4], int xyzn[4]) {
xyzn[0] = abcd[3];
xyzn[1] = abcd[2];
xyzn[2] = abcd[1] / 4;
xyzn[3] = abcd[1] % 4;
}
inline void abcd2xyzn_4(int C, int abcd[4], int xyzn[4]) {
xyzn[0] = abcd[2];
xyzn[1] = abcd[1];
uint t = abcd[0] * C + abcd[3];
xyzn[2] = t / 4;
xyzn[3] = t % 4;
}
inline void xyzn2abcd(int C, int xyzn[4], int abcd[4]) {
abcd[2] = xyzn[0];
abcd[1] = xyzn[1];
uint t = xyzn[2] * 4 + xyzn[3];
abcd[0] = t / C;
abcd[3] = t % C;
}
inline void abcd2xyzn(int C, int abcd[4], int xyzn[4]) {
xyzn[0] = abcd[2];
xyzn[1] = abcd[1];
uint t = abcd[0] * C + abcd[3];
xyzn[2] = t / 4;
xyzn[3] = t % 4;
}
inline int32_t abcd2index(int32_t dim[4], int32_t abcd[4]) {
int32_t r = abcd[0];
r = r * dim[1] + abcd[1];
r = r * dim[2] + abcd[2];
r = r * dim[3] + abcd[3];
return r;
}
inline void index2abcd(int32_t dim[4], int32_t ind, int32_t abcd[4]) {
abcd[3] = ind % dim[3]; ind /= dim[3];
abcd[2] = ind % dim[2]; ind /= dim[2];
abcd[1] = ind % dim[1]; ind /= dim[1];
abcd[0] = ind;
}
inline void trans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) {
for (int i = 0; i < 4; i++) {
opos[i] = ipos[trans[i]];
}
}
inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) {
for (int i = 0; i < 4; i++) {
opos[trans[i]] = ipos[i];
}
}
struct MetalConvParam {
short offsetX;
short offsetY;
short offsetZ;
ushort strideX;
ushort strideY;
ushort dilationX;
ushort dilationY;
};
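The index helpers above pack the last logical dimension four channels per texel: for a rank-4 tensor, the texture coordinate (x, y, slice, lane) maps to the logical (batch, height, width, channel) index through t = slice * 4 + lane. A CPU sketch of xyzn2abcd_4 and its inverse (not part of the library):

import Foundation

// Mirror xyzn2abcd_4 / abcd2xyzn_4 from Common.metal for a tensor with C channels.
func xyzn2abcd(C: Int, x: Int, y: Int, z: Int, n: Int) -> [Int] {
    let t = z * 4 + n
    return [t / C, y, x, t % C]               // (batch, height, width, channel)
}

func abcd2xyzn(C: Int, abcd: [Int]) -> [Int] {
    let t = abcd[0] * C + abcd[3]
    return [abcd[2], abcd[1], t / 4, t % 4]   // (x, y, slice, lane)
}

let abcd = xyzn2abcd(C: 6, x: 3, y: 2, z: 1, n: 1)   // t = 5, so batch 0, channel 5
print(abcd)                                          // [0, 2, 3, 5]
print(abcd2xyzn(C: 6, abcd: abcd))                   // back to [3, 2, 1, 1]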
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef P
#define CONCAT2(a, b) a ## b
#define CONCAT2_(a, b) a ## _ ## b
#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d
#define CONCAT5_(a, b, c, d, e) a ## _ ## b ## _ ## c ## _ ## d ## _ ## e
#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p)
#define VECTOR(p, n) CONCAT2(p, n)
#define FUNC_R(f, r) CONCAT2_(f, r)
#if V == VX
#define VV x
#elif V == VY
#define VV y
#elif V == VZ
#define VV z
#else
#define VV normal
#endif
#if V == VNORMAL
//kernel void FUNC(concat, R, N, normal, P)(array<texture2d_array<P, access::read>, N> in [[texture(0)]],
// texture2d_array<P, access::read> out_x [[texture(N)]],
// texture2d_array<P, access::write> out [[texture(N+1)]],
// constant ConcatParam & pm [[buffer(0)]],
// uint3 gid [[thread_position_in_grid]]) {
//}
kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
texture2d_array<P, access::read> in1 [[texture(1)]],
#if N >= 3
texture2d_array<P, access::read> in2 [[texture(2)]],
#endif
#if N >= 4
texture2d_array<P, access::read> in3 [[texture(3)]],
#endif
#if N >= 5
texture2d_array<P, access::read> in4 [[texture(4)]],
#endif
#if N >= 6
texture2d_array<P, access::read> in5 [[texture(5)]],
#endif
texture2d_array<P, access::read> inx [[texture(N)]],
texture2d_array<P, access::write> out [[texture(N+1)]],
constant ConcatParam & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
ConcatParam cp = pm;
int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4];
VECTOR(P, 4) r = inx.read(gid.xy, gid.z);
for (int i = 0; i < 4; i++) {
xyzn[3] = i;
#if R == 4
xyzn2abcd_4(cp.odim[3], xyzn, abcd);
#else
FUNC_R(xyzn2abcd, R)(xyzn, abcd);
#endif
int k = abcd[cp.axis] - cp.offset;
if (k < 0) continue;
int j = 0;
for (; j < N; j++) {
if (k < cp.vdim[j]) {
break;
}
k -= cp.vdim[j];
}
if (j == N) {
continue;
}
int ta = cp.odim[cp.axis];
abcd[cp.axis] = k;
cp.odim[cp.axis] = cp.vdim[j];
#if R == 4
abcd2xyzn_4(cp.odim[3], abcd, oxyzn);
#else
FUNC_R(abcd2xyzn, R)(abcd, oxyzn);
#endif
cp.odim[cp.axis] = ta;
switch (j) {
case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
#if N >= 3
case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
#endif
#if N >= 4
case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
#endif
#if N >= 5
case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
#endif
#if N >= 6
case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
#endif
}
}
out.write(r, gid.xy, gid.z);
}
#endif // V == NORMAL
#if V == VX
kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
texture2d_array<P, access::read> in1 [[texture(1)]],
#if N >= 3
texture2d_array<P, access::read> in2 [[texture(2)]],
#endif // N >= 3
#if N >= 4
texture2d_array<P, access::read> in3 [[texture(3)]],
#endif // N >= 4
#if N >= 5
texture2d_array<P, access::read> in4 [[texture(4)]],
#endif // N >= 5
#if N >= 6
texture2d_array<P, access::read> in5 [[texture(5)]],
#endif // N >= 6
texture2d_array<P, access::write> out [[texture(N)]],
constant ConcatParam & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
int x = gid.x - pm.offset;
if (x < 0) return;
if (x < pm.vdim[0]) {
VECTOR(P, 4) r = in0.read(gid.xy, gid.z);
out.write(r, gid.xy, gid.z);
return;
}
x -= pm.vdim[0];
if (x < pm.vdim[1]) {
VECTOR(P, 4) r = in1.read(uint2(x, gid.y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#if N >= 3
x -= pm.vdim[1];
if (x < pm.vdim[2]) {
VECTOR(P, 4) r = in2.read(uint2(x, gid.y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 3
#if N >= 4
x -= pm.vdim[2];
if (x < pm.vdim[3]) {
VECTOR(P, 4) r = in3.read(uint2(x, gid.y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 4
#if N >= 5
x -= pm.vdim[3];
if (x < pm.vdim[4]) {
VECTOR(P, 4) r = in4.read(uint2(x, gid.y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 5
#if N >= 6
x -= pm.vdim[4];
if (x < pm.vdim[5]) {
VECTOR(P, 4) r = in5.read(uint2(x, gid.y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 6
}
#endif // V == VX
#if V == VY
kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
texture2d_array<P, access::read> in1 [[texture(1)]],
#if N >= 3
texture2d_array<P, access::read> in2 [[texture(2)]],
#endif // N >= 3
#if N >= 4
texture2d_array<P, access::read> in3 [[texture(3)]],
#endif // N >= 4
#if N >= 5
texture2d_array<P, access::read> in4 [[texture(4)]],
#endif // N >= 5
#if N >= 6
texture2d_array<P, access::read> in5 [[texture(5)]],
#endif // N >= 6
texture2d_array<P, access::write> out [[texture(N)]],
constant ConcatParam & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
int y = gid.y - pm.offset;
if (y < 0) return;
if (y < pm.vdim[0]) {
VECTOR(P, 4) r = in0.read(gid.xy, gid.z);
out.write(r, gid.xy, gid.z);
return;
}
y -= pm.vdim[0];
if (y < pm.vdim[1]) {
VECTOR(P, 4) r = in1.read(uint2(gid.x, y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#if N >= 3
y -= pm.vdim[1];
if (y < pm.vdim[2]) {
VECTOR(P, 4) r = in2.read(uint2(gid.x, y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 3
#if N >= 4
y -= pm.vdim[2];
if (y < pm.vdim[3]) {
VECTOR(P, 4) r = in3.read(uint2(gid.x, y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 4
#if N >= 5
y -= pm.vdim[3];
if (y < pm.vdim[4]) {
VECTOR(P, 4) r = in4.read(uint2(gid.x, y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 5
#if N >= 6
y -= pm.vdim[4];
if (y < pm.vdim[5]) {
VECTOR(P, 4) r = in5.read(uint2(gid.x, y), gid.z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 6
}
#endif // V == VY
#if V == VZ
kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
texture2d_array<P, access::read> in1 [[texture(1)]],
#if N >= 3
texture2d_array<P, access::read> in2 [[texture(2)]],
#endif // N >= 3
#if N >= 4
texture2d_array<P, access::read> in3 [[texture(3)]],
#endif // N >= 4
#if N >= 5
texture2d_array<P, access::read> in4 [[texture(4)]],
#endif // N >= 5
#if N >= 6
texture2d_array<P, access::read> in5 [[texture(5)]],
#endif // N >= 6
texture2d_array<P, access::write> out [[texture(N)]],
constant ConcatParam & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
int z = gid.z - pm.offset;
if (z < 0) return;
if (z < pm.vdim[0]) {
VECTOR(P, 4) r = in0.read(gid.xy, gid.z);
out.write(r, gid.xy, gid.z);
return;
}
z -= pm.vdim[0];
if (z < pm.vdim[1]) {
VECTOR(P, 4) r = in1.read(gid.xy, z);
out.write(r, gid.xy, gid.z);
return;
}
#if N >= 3
z -= pm.vdim[1];
if (z < pm.vdim[2]) {
VECTOR(P, 4) r = in2.read(gid.xy, z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 3
#if N >= 4
z -= pm.vdim[2];
if (z < pm.vdim[3]) {
VECTOR(P, 4) r = in3.read(gid.xy, z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 4
#if N >= 5
z -= pm.vdim[3];
if (z < pm.vdim[4]) {
VECTOR(P, 4) r = in4.read(gid.xy, z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 5
#if N >= 6
z -= pm.vdim[4];
if (z < pm.vdim[5]) {
VECTOR(P, 4) r = in5.read(gid.xy, z);
out.write(r, gid.xy, gid.z);
return;
}
#endif // N >= 6
}
#endif // V == VZ
#undef VV
#endif // #ifdef P
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
struct ConcatParam {
int32_t odim[4];
int32_t axis;
int32_t offset;
int32_t trans[4];
int32_t vdim[6];
};
#define VNORMAL 1
#define VX 2
#define VY 3
#define VZ 4
// >> fast mode
// only support concat_{2,3,4}_{2,3,4,5,6}_y_{float,half}
// only support concat_{3,4}_{2,3,4,5,6}_x_{float,half}
// only support concat_{1,2,3,4}_{2,3,4,5,6}_z_{float,half}
// >> normal mode (loop mode)
// ssd-ar: (R=4, N=3, V=z), (R=3, N=2, V=y), (R=2, N=5, V=x), (R=3, N=5, V=x)
// ssd: (R=2, N=6, V=y), (R=3, N=6, V=y)
// genet: (R=4, N=2, V=normal)
// ssd-ar: (R=3, N=5, V=x)
#define V VX
#define R 3
#define N 5
#define P float
#include "ConcatKernel.inc.metal"
#undef P
#define P half
#include "ConcatKernel.inc.metal"
#undef P
#undef N
#undef R
#undef V
// ssd-ar: (R=2, N=5, V=x)
#define V VX
#define R 2
#define N 5
#define P float
#include "ConcatKernel.inc.metal"
#undef P
#define P half
#include "ConcatKernel.inc.metal"
#undef P
#undef N
#undef R
#undef V
// ssd-ar: (R=3, N=2, V=y)
#define V VY
#define R 3
#define N 2
#define P float
#include "ConcatKernel.inc.metal"
#undef P
#define P half
#include "ConcatKernel.inc.metal"
#undef P
#undef N
#undef R
#undef V
// ssd-ar: (R=4, N=3, V=z)
#define V VZ
#define R 4
#define N 3
#define P float
#include "ConcatKernel.inc.metal"
#undef P
#define P half
#include "ConcatKernel.inc.metal"
#undef P
#undef N
#undef R
#undef V
// ssd: (R=2, N=6, V=y)
#define V VY
#define R 2
#define N 6
#define P float
#include "ConcatKernel.inc.metal"
#undef P
#define P half
#include "ConcatKernel.inc.metal"
#undef P
#undef N
#undef R
#undef V
// ssd: (R=3, N=6, V=y)
#define V VY
#define R 3
#define N 6
#define P float
#include "ConcatKernel.inc.metal"
#undef P
#define P half
#include "ConcatKernel.inc.metal"
#undef P
#undef N
#undef R
#undef V
#define V VNORMAL
#define R 4
#define N 2
#define P float
#include "ConcatKernel.inc.metal"
#undef P
#define P half
#include "ConcatKernel.inc.metal"
#undef P
#undef N
#undef R
#undef V
#define V VY
#define R 2
#define N 2
#define P float
#include "ConcatKernel.inc.metal"
#undef P
#define P half
#include "ConcatKernel.inc.metal"
#undef P
#undef N
#undef R
#undef V
#define V VY
#define R 2
#define N 5
#define P float
#include "ConcatKernel.inc.metal"
#undef P
#define P half
#include "ConcatKernel.inc.metal"
#undef P
#undef N
#undef R
#undef V
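Each #define V / R / N / P block above instantiates one specialization, so the library ends up exporting names such as concat_3_2_y_float or concat_4_2_normal_half. The host side is assumed to assemble the matching function name from rank, input count, fast-path axis and precision, much as SplitKernel does above (a sketch, not the repo's actual ConcatKernel class):

import Foundation

// Build the Metal function name of a concat specialization, mirroring the
// CONCAT5_ macro expansion: concat_<rank>_<inputs>_<axis>_<precision>.
func concatFunctionName(rank: Int, inputs: Int, axis: String, halfPrecision: Bool) -> String {
    return "concat_\(rank)_\(inputs)_\(axis)_\(halfPrecision ? "half" : "float")"
}

print(concatFunctionName(rank: 3, inputs: 2, axis: "y", halfPrecision: false))     // concat_3_2_y_float
print(concatFunctionName(rank: 4, inputs: 2, axis: "normal", halfPrecision: true)) // concat_4_2_normal_half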
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
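// Fused convolution + bias add + batch norm + ReLU kernels (half-precision texture path).
// Weights are packed per output slice: weights[outSlice*kHW*inSlices*4 + c*kHW*inSlices + j*inSlices + i]
// holds the 4 input channels of input slice i at kernel position j, for output channel c of the slice.
// new_scale / new_biase are consumed as a plain per-channel affine before the ReLU; they are assumed
// to be the batch-norm scale/shift already folded on the host.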
kernel void conv_add_batch_norm_relu_1x1_half(
texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device half4 *new_scale [[buffer(3)]],
const device half4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
half4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
outTexture.write(half4(output), gid.xy, gid.z);
}
kernel void conv_add_batch_norm_relu_3x3_half(
texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device half4 *new_scale [[buffer(3)]],
const device half4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
half4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
outTexture.write(half4(output), gid.xy, gid.z);
}
kernel void depthwise_conv_add_batch_norm_relu_3x3_half(
texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
const device half4 *new_scale [[buffer(3)]],
const device half4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0);
half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
half4 input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
outTexture.write(half4(output), gid.xy, gid.z);
}
/*---------------------------------------------*/
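// Float32 variants of the same fused conv + add + batch-norm + ReLU kernels follow.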
kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z);
}
kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
const device float4 *new_scale [[buffer(3)]],
const device float4 *new_biase [[buffer(4)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0);
float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
float4 input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
#pragma mark - convAdd
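// conv_add_* kernels: convolution with the bias folded in up front (the accumulator starts from
// biase[gid.z]), so the commented-out "output = output + biase[...]" lines below are leftovers rather
// than missing work. The kernel-size variants (1x1, 3x3, 5x1, 1x5) differ only in their sampling
// pattern; the 3x3, 5x1 and 1x5 variants also honor param.dilationX / param.dilationY.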
kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = biase[gid.z];
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
// output = output + biase[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = biase[gid.z];
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
float4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i);
for (int j = 0; j < 9; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
// output = output + biase[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_5x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 5;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = biase[gid.z];
ushort dilation_y = param.dilationY;
float4 input[5];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
for (int j = 0; j < 5; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
// output = output + biase[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_1x5(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 5;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = biase[gid.z];
ushort dilation_x = param.dilationX;
float4 input[5];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
for (int j = 0; j < 5; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
// output = output + biase[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = biase[gid.z];
float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
float4 input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
// output = output + biase[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
#pragma mark - half
kernel void conv_add_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
half4 output = biase[gid.z];
half4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
// output = output + float4(biase[gid.z]);
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
half4 output = biase[gid.z];
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
half4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i);
for (int j = 0; j < 9; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(float4(input[j]), float4(weight_x));
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(float4(input[j]), float4(weight_y));
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(float4(input[j]), float4(weight_z));
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(float4(input[j]), float4(weight_w));
}
}
// output = output + float4(biase[gid.z]);
outTexture.write(output, gid.xy, gid.z);
}
kernel void depthwise_conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
half4 output = biase[gid.z];
half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
half4 input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
// output = output + float4(biase[gid.z]);
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_5x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 5;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
half4 output = biase[gid.z];
ushort dilation_y = param.dilationY;
half4 input[5];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
for (int j = 0; j < 5; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
// output = output + float4(biase[gid.z]);
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_add_1x5_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 5;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
half4 output = biase[gid.z];
ushort dilation_x = param.dilationX;
half4 input[5];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
for (int j = 0; j < 5; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
// output = output + float4(biase[gid.z]);
outTexture.write(output, gid.xy, gid.z);
}
kernel void test_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
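// Debug-only guard: every thread except (0, 0, 0) returns, so this kernel computes a single
// output value that can be inspected when validating the 3x3 path.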
if (gid.x > 0 || gid.y > 0 || gid.z > 0) { return; }
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
float4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i);
for (int j = 0; j < 9; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
// output = output + biase[gid.z];
outTexture.write(output, gid.xy, gid.z);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef P
#include "Macro.metal"
#pragma mark - convAdd
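// Template for conv + add fused with PReLU. It compiles only when P is defined (see the enclosing
// #ifdef P) and specializes on PRELU_CHANNEL / PRELU_ELEMENT / PRELU_OTHER: per-channel alpha,
// per-element alpha, or a single shared alpha scalar. FUNC3_ (from Macro.metal) presumably pastes
// its three arguments together to form the exported kernel name.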
kernel void FUNC3_(conv_add_1x1, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
texture2d_array<P, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device VECTOR(P, 4) *weights [[buffer(1)]],
const device VECTOR(P, 4) *biase [[buffer(2)]],
#ifdef PRELU_CHANNEL
const device VECTOR(P, 4) *alpha [[buffer(3)]],
#endif
#ifdef PRELU_ELEMENT
const device VECTOR(P, 4) *alpha [[buffer(3)]],
#endif
#ifdef PRELU_OTHER
const device P *alpha [[buffer(3)]],
#endif
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
VECTOR(P, 4) output = biase[gid.z];
VECTOR(P, 4) input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);

VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
// output = output + float4(biase[gid.z]);
#ifdef PRELU_CHANNEL
VECTOR(P, 4) alpha_value = alpha[gid.z];
output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
#endif
#ifdef PRELU_ELEMENT
int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
#endif
#ifdef PRELU_OTHER
P alpha_value = alpha[0];
output.x = output.x > 0 ? output.x : (alpha_value * output.x);
output.y = output.y > 0 ? output.y : (alpha_value * output.y);
output.z = output.z > 0 ? output.z : (alpha_value * output.z);
output.w = output.w > 0 ? output.w : (alpha_value * output.w);
#endif
outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
}
kernel void FUNC3_(conv_add_3x3, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
texture2d_array<P, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device VECTOR(P, 4) *weights [[buffer(1)]],
const device VECTOR(P, 4) *biase [[buffer(2)]],
#ifdef PRELU_CHANNEL
const device VECTOR(P, 4) *alpha [[buffer(3)]],
#endif
#ifdef PRELU_ELEMENT
const device VECTOR(P, 4) *alpha [[buffer(3)]],
#endif
#ifdef PRELU_OTHER
const device P *alpha [[buffer(3)]],
#endif
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
VECTOR(P, 4) output = biase[gid.z];
ushort dilation_x = param.dilationX;
ushort dilation_y = param.dilationY;
VECTOR(P, 4) input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i);
for (int j = 0; j < 9; ++j) {
VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
// output = output + float4(biase[gid.z]);
#ifdef PRELU_CHANNEL
VECTOR(P, 4) alpha_value = alpha[gid.z];
output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
#endif
#ifdef PRELU_ELEMENT
int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
#endif
#ifdef PRELU_OTHER
P alpha_value = alpha[0];
output.x = output.x > 0 ? output.x : (alpha_value * output.x);
output.y = output.y > 0 ? output.y : (alpha_value * output.y);
output.z = output.z > 0 ? output.z : (alpha_value * output.z);
output.w = output.w > 0 ? output.w : (alpha_value * output.w);
#endif
outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
}
kernel void FUNC3_(conv_add_5x1, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
texture2d_array<P, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device VECTOR(P, 4) *weights [[buffer(1)]],
const device VECTOR(P, 4) *biase [[buffer(2)]],
#ifdef PRELU_CHANNEL
const device VECTOR(P, 4) *alpha [[buffer(3)]],
#endif
#ifdef PRELU_ELEMENT
const device VECTOR(P, 4) *alpha [[buffer(3)]],
#endif
#ifdef PRELU_OTHER
const device P *alpha [[buffer(3)]],
#endif
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 5;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
VECTOR(P, 4) output = biase[gid.z];
ushort dilation_y = param.dilationY;
VECTOR(P, 4) input[5];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
for (int j = 0; j < 5; ++j) {
VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
#ifdef PRELU_CHANNEL
VECTOR(P, 4) alpha_value = alpha[gid.z];
output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
#endif
#ifdef PRELU_ELEMENT
int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
#endif
#ifdef PRELU_OTHER
P alpha_value = alpha[0];
output.x = output.x > 0 ? output.x : (alpha_value * output.x);
output.y = output.y > 0 ? output.y : (alpha_value * output.y);
output.z = output.z > 0 ? output.z : (alpha_value * output.z);
output.w = output.w > 0 ? output.w : (alpha_value * output.w);
#endif
outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
}
kernel void FUNC3_(conv_add_1x5, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
texture2d_array<P, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device VECTOR(P, 4) *weights [[buffer(1)]],
const device VECTOR(P, 4) *biase [[buffer(2)]],
#ifdef PRELU_CHANNEL
const device VECTOR(P, 4) *alpha [[buffer(3)]],
#endif
#ifdef PRELU_ELEMENT
const device VECTOR(P, 4) *alpha [[buffer(3)]],
#endif
#ifdef PRELU_OTHER
const device P *alpha [[buffer(3)]],
#endif
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 5;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
VECTOR(P, 4) output = biase[gid.z];
ushort dilation_x = param.dilationX;
VECTOR(P, 4) input[5];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
for (int j = 0; j < 5; ++j) {
VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
#ifdef PRELU_CHANNEL
VECTOR(P, 4) alpha_value = alpha[gid.z];
output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
#endif
#ifdef PRELU_ELEMENT
int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
#endif
#ifdef PRELU_OTHER
P alpha_value = alpha[0];
output.x = output.x > 0 ? output.x : (alpha_value * output.x);
output.y = output.y > 0 ? output.y : (alpha_value * output.y);
output.z = output.z > 0 ? output.z : (alpha_value * output.z);
output.w = output.w > 0 ? output.w : (alpha_value * output.w);
#endif
outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
}
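// Depthwise variant: each output slice reads only its own input slice, and the filter is stored as
// 9 scalars per channel (4 channels per slice), indexed as weights[slice*9*4 + c*9 + j].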
kernel void FUNC3_(depthwise_conv_add_3x3, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
texture2d_array<P, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device P *weights [[buffer(1)]],
const device VECTOR(P, 4) *biase [[buffer(2)]],
#ifdef PRELU_CHANNEL
const device VECTOR(P, 4) *alpha [[buffer(3)]],
#endif
#ifdef PRELU_ELEMENT
const device VECTOR(P, 4) *alpha [[buffer(3)]],
#endif
#ifdef PRELU_OTHER
const device P *alpha [[buffer(3)]],
#endif
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
VECTOR(P, 4) output = biase[gid.z];
VECTOR(P, 4) inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
VECTOR(P, 4) input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
#ifdef PRELU_CHANNEL
VECTOR(P, 4) alpha_value = alpha[gid.z];
output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
#endif
#ifdef PRELU_ELEMENT
int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
#endif
#ifdef PRELU_OTHER
P alpha_value = alpha[0];
output.x = output.x > 0 ? output.x : (alpha_value * output.x);
output.y = output.y > 0 ? output.y : (alpha_value * output.y);
output.z = output.z > 0 ? output.z : (alpha_value * output.z);
output.w = output.w > 0 ? output.w : (alpha_value * output.w);
#endif
outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
}
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
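// Instantiates ConvAddPrelu.inc.metal for every (precision, PReLU mode) pair. With the FUNC3_
// naming assumed above, each include below should yield one kernel family, e.g.
// conv_add_1x1_prelu_channel_float or conv_add_3x3_prelu_other_half.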
#define P float
#define PRELU_CHANNEL prelu_channel
#define PRELU_TYPE prelu_channel
#include "ConvAddPrelu.inc.metal"
#undef PRELU_TYPE
#undef PRELU_CHANNEL
#define PRELU_ELEMENT prelu_element
#define PRELU_TYPE prelu_element
#include "ConvAddPrelu.inc.metal"
#undef PRELU_TYPE
#undef PRELU_ELEMENT
#define PRELU_OTHER prelu_other
#define PRELU_TYPE prelu_other
#include "ConvAddPrelu.inc.metal"
#undef PRELU_TYPE
#undef PRELU_OTHER
#undef P
#define P half
#define PRELU_CHANNEL prelu_channel
#define PRELU_TYPE prelu_channel
#include "ConvAddPrelu.inc.metal"
#undef PRELU_TYPE
#undef PRELU_CHANNEL
#define PRELU_ELEMENT prelu_element
#define PRELU_TYPE prelu_element
#include "ConvAddPrelu.inc.metal"
#undef PRELU_TYPE
#undef PRELU_ELEMENT
#define PRELU_OTHER prelu_other
#define PRELU_TYPE prelu_other
#include "ConvAddPrelu.inc.metal"
#undef PRELU_TYPE
#undef PRELU_OTHER
#undef P
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
#pragma mark - conv bn relu
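// Conv + batch norm + ReLU without a separate bias: output = max(conv * new_scale + new_biase, 0).
// As in the fused conv_add_batch_norm_relu kernels, new_scale / new_biase are assumed to be the
// batch-norm parameters pre-folded on the host into a per-channel affine.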
kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *new_scale [[buffer(2)]],
const device float4 *new_biase [[buffer(3)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
const device float4 *new_scale [[buffer(2)]],
const device float4 *new_biase [[buffer(3)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z);
}
kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float *weights [[buffer(1)]],
const device float4 *new_scale [[buffer(2)]],
const device float4 *new_biase [[buffer(3)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0);
float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
float4 input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
outTexture.write(output, gid.xy, gid.z);
}
#pragma mark - half
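// Half-precision variants: textures and weights are half, but the accumulation is done in
// float before the result is converted back to half.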
kernel void conv_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *new_scale [[buffer(2)]],
const device half4 *new_biase [[buffer(3)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
half4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(float4(input), float4(weight_x));
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(float4(input), float4(weight_y));
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(float4(input), float4(weight_z));
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(float4(input), float4(weight_w));
}
output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
outTexture.write(half4(output), gid.xy, gid.z);
}
kernel void conv_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *new_scale [[buffer(2)]],
const device half4 *new_biase [[buffer(3)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
half4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(float4(input[j]), float4(weight_x));
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(float4(input[j]), float4(weight_y));
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(float4(input[j]), float4(weight_z));
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(float4(input[j]), float4(weight_w));
}
}
output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
outTexture.write(half4(output), gid.xy, gid.z);
}
kernel void depthwise_conv_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half *weights [[buffer(1)]],
const device half4 *new_scale [[buffer(2)]],
const device half4 *new_biase [[buffer(3)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0);
half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
half4 input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
outTexture.write(half4(output), gid.xy, gid.z);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
// conv
#pragma mark -- conv
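// Plain convolution kernels with no fused post-op: 3x3, depthwise 3x3 and 1x1,
// each with a float and a half variant below.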
kernel void conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
outTexture.write(output, gid.xy, gid.z);
}
kernel void depthwise_conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0);
float4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
float4 input = inputs[j];
output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
}
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
float4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(input, weight_x);
float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(input, weight_y);
float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(input, weight_z);
float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(input, weight_w);
}
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
half4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(float4(input[j]), float4(weight_x));
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(float4(input[j]), float4(weight_y));
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(float4(input[j]), float4(weight_z));
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(float4(input[j]), float4(weight_w));
}
}
outTexture.write(half4(output), gid.xy, gid.z);
}
kernel void depthwise_conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0);
half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
half4 input = inputs[j];
output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]);
output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]);
output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
}
outTexture.write(half4(output), gid.xy, gid.z);
}
kernel void conv_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
half4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(float4(input), float4(weight_x));
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(float4(input), float4(weight_y));
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(float4(input), float4(weight_z));
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(float4(input), float4(weight_w));
}
outTexture.write(half4(output), gid.xy, gid.z);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
using namespace metal;
struct MetalConvTransposeParam{
ushort kernelW;
ushort kernelH;
ushort strideX;
ushort strideY;
ushort paddingX;
ushort paddingY;
ushort dilationX;
ushort dilationY;
};
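// Specialized transposed convolution for a 2x2 kernel with stride 2: each output pixel
// effectively receives a contribution from exactly one input pixel (gid / 2) and one of the
// four kernel taps (selected by gid % 2), so the kernel reduces to a per-tap 1x1 convolution
// over the input slices.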
kernel void conv_transpose2x2_stride2(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvTransposeParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
int input_array_size = inTexture.get_array_size();
int kernel_index_x = gid.x % 2;
int kernel_index_y = gid.y % 2;
int kernel_index = kernel_index_y * 2 + kernel_index_x;
int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size);
int input_x = gid.x / 2;
int input_y = gid.y / 2;
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
float4 output = float4(0.0);
for (int i = 0; i < input_array_size; ++i) {
float4 input = inTexture.sample(sample, float2(input_x, input_y), i);
float4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i];
float4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i];
float4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i];
float4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i];
output.x += dot(input, kernel_slice0);
output.y += dot(input, kernel_slice1);
output.z += dot(input, kernel_slice2);
output.w += dot(input, kernel_slice3);
}
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_transpose2x2_stride2_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvTransposeParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
int input_array_size = inTexture.get_array_size();
int kernel_index_x = gid.x % 2;
int kernel_index_y = gid.y % 2;
int kernel_index = kernel_index_y * 2 + kernel_index_x;
int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size);
int input_x = gid.x / 2;
int input_y = gid.y / 2;
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
float4 output = float4(0.0);
for (int i = 0; i < input_array_size; ++i) {
half4 input = inTexture.sample(sample, float2(input_x, input_y), i);
half4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i];
half4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i];
half4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i];
half4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i];
output.x += dot(float4(input), float4(kernel_slice0));
output.y += dot(float4(input), float4(kernel_slice1));
output.z += dot(float4(input), float4(kernel_slice2));
output.w += dot(float4(input), float4(kernel_slice3));
}
outTexture.write(half4(output), gid.xy, gid.z);
}
//kernel void conv_transpose(texture2d_array<float, access::sample> inTexture [[texture(0)]],
// texture2d_array<float, access::write> outTexture [[texture(1)]],
// constant MetalConvTransposeParam &param [[buffer(0)]],
// const device float4 *weights [[buffer(1)]],
// uint3 gid [[thread_position_in_grid]]){
// if (gid.x >= outTexture.get_width() ||
// gid.y >= outTexture.get_height() ||
// gid.z >= outTexture.get_array_size()) {
// return;
// }
//
// int input_array_size = inTexture.get_array_size();
//
// uint kernel_one_output_slice = input_array_size * param.kernelW * param.kernelH;
//
// uint kernel_stride_z = gid.z * 4 * (kernel_one_output_slice);
//
// constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
//
// float4 output;
//
// for (int w = 0; w < param.kernelW; ++w) {
// int top = gid.x - w * param.dilationX + param.paddingX;
// int input_x = top / param.strideX;
// if (top < 0 || input_x >= int(inTexture.get_width())) {
// continue;
// }
//
// for (int h = 0; h < param.kernelH; ++h) {
// int top_y = gid.y - h * param.dilationY + param.paddingY;
// int input_y = top_y / param.strideY;
// if (top_y < 0 || input_y >= int(inTexture.get_height())) {
// continue;
// }
//
// uint kernel_index = (w * param.kernelH + h) * inTexture.get_array_size();
//
// for (int slice = 0; slice < input_array_size; ++slice) {
//
// float4 input;
// float4 kernel_slice = weights[kernel_stride_z + 0 * kernel_one_output_slice + kernel_index + slice];
// float4 kernel_slice1 = weights[kernel_stride_z + 1 * kernel_one_output_slice + kernel_index + slice];
//
// float4 kernel_slice2 = weights[kernel_stride_z + 2 * kernel_one_output_slice + kernel_index + slice];
//
// float4 kernel_slice3 = weights[kernel_stride_z + 3 * kernel_one_output_slice + kernel_index + slice];
//
// input = inTexture.sample(sample, float2(input_x, input_y), slice);
// output.x += dot(input, kernel_slice);
// output.y += dot(input, kernel_slice1);
// output.z += dot(input, kernel_slice2);
// output.w += dot(input, kernel_slice3);
// }
// }
// }
//
// outTexture.write(output, gid.xy, gid.z);
//}
//
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
struct ElementwiseAddParam {
int32_t fast;
int32_t axis;
int32_t ylen;
int32_t xdim[4];
int32_t xtrans[4];
int32_t ydim[4];
int32_t ytrans[4];
};
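// elementwise_add: when pm.fast == 1 both inputs share the same layout and are added texel by
// texel; otherwise Y is broadcast onto X using the axis/dim/transpose metadata, fetching the
// matching Y element for each of the four channels of an X texel.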
kernel void elementwise_add(texture2d_array<float, access::read> inputX [[texture(0)]],
texture2d_array<float, access::read> inputY [[texture(1)]],
texture2d_array<float, access::write> outTexture [[texture(2)]],
constant ElementwiseAddParam &pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
float4 rx, ry;
if (pm.fast == 1) {
rx = inputX.read(gid.xy, gid.z);
ry = inputY.read(gid.xy, gid.z);
} else {
rx = inputX.read(gid.xy, gid.z);
int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
int32_t yshift = 4 - pm.ylen - pm.axis;
for (int n = 0; n < 4; n++) {
x_xyzn[3] = n;
xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
invtrans(xtrans, x_abcd, t_abcd);
for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) {
y_abcd[yshift+k] = t_abcd[k];
}
trans(ytrans, y_abcd, t_abcd);
abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
}
}
float4 r = rx + ry;
outTexture.write(r, gid.xy, gid.z);
}
kernel void elementwise_add_half(texture2d_array<half, access::read> inputX [[texture(0)]],
texture2d_array<half, access::read> inputY [[texture(1)]],
texture2d_array<half, access::write> outTexture [[texture(2)]],
constant ElementwiseAddParam &pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
half4 rx, ry;
if (pm.fast == 1) {
rx = inputX.read(gid.xy, gid.z);
ry = inputY.read(gid.xy, gid.z);
} else {
rx = inputX.read(gid.xy, gid.z);
int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
int32_t yshift = 4 - pm.ylen - pm.axis;
for (int n = 0; n < 4; n++) {
x_xyzn[3] = n;
xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
invtrans(xtrans, x_abcd, t_abcd);
for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) {
y_abcd[yshift+k] = t_abcd[k];
}
trans(ytrans, y_abcd, t_abcd);
abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
}
}
half4 r = rx + ry;
outTexture.write(r, gid.xy, gid.z);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef P
#include <metal_stdlib>
#include "Macro.metal"
using namespace metal;
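// Included from ElementwiseAddPreluKernel.metal with P and PRELU_TYPE defined. FUNC3_ expands
// to a kernel named elementwise_add_<PRELU_TYPE>_<P>; the PRELU_* macros pick whether alpha is
// per-channel, per-element or a single scalar.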
kernel void FUNC3_(elementwise_add, PRELU_TYPE, P)(texture2d_array<P, access::read> inputX [[texture(0)]],
texture2d_array<P, access::read> inputY [[texture(1)]],
texture2d_array<P, access::write> outTexture [[texture(2)]],
constant ElementwiseAddParam &pm [[buffer(0)]],
#ifdef PRELU_CHANNEL
const device VECTOR(P, 4) *alpha [[buffer(1)]],
#endif
#ifdef PRELU_ELEMENT
const device VECTOR(P, 4) *alpha [[buffer(1)]],
#endif
#ifdef PRELU_OTHER
const device P *alpha [[buffer(1)]],
#endif
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
VECTOR(P, 4) rx, ry;
if (pm.fast == 1) {
rx = inputX.read(gid.xy, gid.z);
ry = inputY.read(gid.xy, gid.z);
} else {
rx = inputX.read(gid.xy, gid.z);
int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
int32_t yshift = 4 - pm.ylen - pm.axis;
for (int n = 0; n < 4; n++) {
x_xyzn[3] = n;
xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
invtrans(xtrans, x_abcd, t_abcd);
for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) {
y_abcd[yshift+k] = t_abcd[k];
}
trans(ytrans, y_abcd, t_abcd);
abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
}
}
VECTOR(P, 4) output = rx + ry;
#ifdef PRELU_CHANNEL
VECTOR(P, 4) alpha_value = alpha[gid.z];
output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
#endif
#ifdef PRELU_ELEMENT
int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
#endif
#ifdef PRELU_OTHER
P alpha_value = alpha[0];
output.x = output.x > 0 ? output.x : (alpha_value * output.x);
output.y = output.y > 0 ? output.y : (alpha_value * output.y);
output.z = output.z > 0 ? output.z : (alpha_value * output.z);
output.w = output.w > 0 ? output.w : (alpha_value * output.w);
#endif
outTexture.write(output, gid.xy, gid.z);
}
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
struct ElementwiseAddParam {
int32_t fast;
int32_t axis;
int32_t ylen;
int32_t xdim[4];
int32_t xtrans[4];
int32_t ydim[4];
int32_t ytrans[4];
};
#define P float
#define PRELU_CHANNEL prelu_channel
#define PRELU_TYPE channel
#include "ElementwiseAddPreluKernel.inc.metal"
#undef PRELU_TYPE
#undef PRELU_CHANNEL
#define PRELU_ELEMENT element
#define PRELU_TYPE prelu_element
#include "ElementwiseAddPreluKernel.inc.metal"
#undef PRELU_TYPE
#undef PRELU_ELEMENT
#define PRELU_OTHER other
#define PRELU_TYPE prelu_other
#include "ElementwiseAddPreluKernel.inc.metal"
#undef PRELU_TYPE
#undef PRELU_OTHER
#undef P
#define P half
#define PRELU_CHANNEL channel
#define PRELU_TYPE channel
#include "ElementwiseAddPreluKernel.inc.metal"
#undef PRELU_TYPE
#undef PRELU_CHANNEL
#define PRELU_ELEMENT element
#define PRELU_TYPE prelu_element
#include "ElementwiseAddPreluKernel.inc.metal"
#undef PRELU_TYPE
#undef PRELU_ELEMENT
#define PRELU_OTHER other
#define PRELU_TYPE prelu_other
#include "ElementwiseAddPreluKernel.inc.metal"
#undef PRELU_TYPE
#undef PRELU_OTHER
#undef P
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
using namespace metal;
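// fetch copies a texture2d_array back into a linear float buffer; each array slice occupies
// 4 * width * height consecutive floats, one plane per channel. The fetch_placeholder kernels
// further below are no-op placeholders.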
kernel void fetch(texture2d_array<float, access::read> inTexture [[texture(0)]],
device float *output [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= inTexture.get_width() ||
gid.y >= inTexture.get_height() ||
gid.z >= inTexture.get_array_size()) {
return;
}
int input_width = inTexture.get_width();
int input_height = inTexture.get_height();
const float4 input = inTexture.read(gid.xy, gid.z);
int output_to = 4 * input_width * input_height;
output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y;
output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
}
kernel void fetch_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
device float * output [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= inTexture.get_width() ||
gid.y >= inTexture.get_height() ||
gid.z >= inTexture.get_array_size()) {
return;
}
int input_width = inTexture.get_width();
int input_height = inTexture.get_height();
const half4 input = inTexture.read(gid.xy, gid.z);
int output_to = 4 * input_width * input_height;
output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
// output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
}
kernel void fetch_placeholder(texture2d_array<float, access::read> inTexture [[texture(0)]],
device float *output [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
}
kernel void fetch_placeholder_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
device float *output [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
// Placeholder kernel: intentionally does nothing.
kernel void place_holder(texture2d<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
}
struct OutputDim {
ushort width;
ushort height;
ushort strideX;
ushort strideY;
};
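// resize: nearest-neighbour resample of a plain 2D input texture, stepping through it with
// strideX/strideY and writing the sampled RGBA texel into the output texture array.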
kernel void resize(texture2d<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant OutputDim &params [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint2 pos = gid.xy * uint2(params.strideX, params.strideY);
const half4 input = inTexture.read(pos);
outTexture.write(half4(input.x, input.y, input.z, input.w), gid.xy, gid.z);
}
kernel void texture2d_to_2d_array(texture2d<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= inTexture.get_width() ||
gid.y >= inTexture.get_height()){
return;
}
const float4 input = inTexture.read(gid.xy);
outTexture.write(input, gid.xy, 0);
}
kernel void texture2d_to_2d_array_half(texture2d<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= inTexture.get_width() ||
gid.y >= inTexture.get_height()){
return;
}
const half4 input = inTexture.read(gid.xy);
outTexture.write(input, gid.xy, 0);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
using namespace metal;
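// Token-pasting helpers used by the .inc.metal templates to build kernel and vector type names,
// e.g. FUNC3_(elementwise_add, channel, float) -> elementwise_add_channel_float and
// VECTOR(float, 4) -> float4.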
#define CONCAT2(a, b) a ## b
#define CONCAT2_(a, b) a ## _ ## b
#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d
#define CONCAT5_(a, b, c, d, e) a ## _ ## b ## _ ## c ## _ ## d ## _ ## e
#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p)
#define VECTOR(p, n) CONCAT2(p, n)
#define FUNC3_(a, b, c) CONCAT3_(a, b, c)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
using namespace metal;
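// NMS result fetch: nms_fetch_result copies the x channel (score) of each texel into a linear
// buffer indexed by row * width + column, and nms_fetch_bbox copies whole float4 bounding boxes;
// the _half variants read half textures but still write float output.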
kernel void nms_fetch_result(texture2d_array<float, access::read> inTexture [[texture(0)]],
device float *output [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= inTexture.get_width() ||
gid.y >= inTexture.get_height() ||
gid.z >= inTexture.get_array_size()) {
return;
}
int input_width = inTexture.get_width();
const float4 input = inTexture.read(gid.xy, gid.z);
output[gid.y * input_width + gid.x] = input.x;
}
kernel void nms_fetch_result_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
device float *output [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= inTexture.get_width() ||
gid.y >= inTexture.get_height() ||
gid.z >= inTexture.get_array_size()) {
return;
}
int input_width = inTexture.get_width();
const half4 input = inTexture.read(gid.xy, gid.z);
output[gid.y * input_width + gid.x] = input.x;
}
kernel void nms_fetch_bbox(texture2d_array<float, access::read> inTexture [[texture(0)]],
device float4 *output [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= inTexture.get_width() ||
gid.y >= inTexture.get_height() ||
gid.z >= inTexture.get_array_size()) {
return;
}
int input_width = inTexture.get_width();
// int input_height = inTexture.get_height();
const float4 input = inTexture.read(gid.xy, gid.z);
output[gid.y * input_width + gid.x] = input;
}
kernel void nms_fetch_bbox_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
device float4 *output [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= inTexture.get_width() ||
gid.y >= inTexture.get_height() ||
gid.z >= inTexture.get_array_size()) {
return;
}
int input_width = inTexture.get_width();
// int input_height = inTexture.get_height();
const half4 input = inTexture.read(gid.xy, gid.z);
output[gid.y * input_width + gid.x] = float4(input);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
struct PoolParam {
int ksizeX;
int ksizeY;
int strideX;
int strideY;
int paddingX;
int paddingY;
int poolType;
};
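// pool: poolType 0 selects max pooling, poolType 1 average pooling over the window
// [xmin, xmax) x [ymin, ymax) clamped to the input bounds; the average variant divides by the
// full ksizeX * ksizeY window size.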
kernel void pool(texture2d_array<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant PoolParam &pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
int xmin = gid.x * pm.strideX - pm.paddingX;
int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
xmin = max(xmin, 0);
int ymin = gid.y * pm.strideY - pm.paddingY;
int ymax = min(ymin + pm.ksizeY, int(inTexture.get_height()));
ymin = max(ymin, 0);
float4 r = 0;
if (pm.poolType == 0) {
r = inTexture.read(uint2(xmin, ymin), gid.z);
for (int x = xmin; x < xmax; x++) {
for (int y = ymin; y < ymax; y++) {
r = fmax(r, inTexture.read(uint2(x, y), gid.z));
}
}
} else if (pm.poolType == 1) {
for (int x = xmin; x < xmax; x++) {
for (int y = ymin; y < ymax; y++) {
r += inTexture.read(uint2(x, y), gid.z);
}
}
r /= pm.ksizeX * pm.ksizeY;
}
outTexture.write(r, gid.xy, gid.z);
}
kernel void pool_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant PoolParam &pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
int xmin = gid.x * pm.strideX - pm.paddingX;
int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
xmin = max(xmin, 0);
int ymin = gid.y * pm.strideY - pm.paddingY;
int ymax = min(ymin + pm.ksizeY, int(inTexture.get_height()));
ymin = max(ymin, 0);
half4 r = 0;
if (pm.poolType == 0) {
r = inTexture.read(uint2(xmin, ymin), gid.z);
for (int x = xmin; x < xmax; x++) {
for (int y = ymin; y < ymax; y++) {
r = fmax(r, inTexture.read(uint2(x, y), gid.z));
}
}
} else if (pm.poolType == 1) {
for (int x = xmin; x < xmax; x++) {
for (int y = ymin; y < ymax; y++) {
r += inTexture.read(uint2(x, y), gid.z);
}
}
r /= pm.ksizeX * pm.ksizeY;
}
outTexture.write(r, gid.xy, gid.z);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
using namespace metal;
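// PReLU kernels in three alpha layouts: prelu_channel uses one float4 alpha per output slice,
// prelu_element one alpha per spatial position and slice, and prelu_other a single scalar alpha
// shared by all elements; the _half variants mirror the same logic.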
kernel void prelu_channel(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
const device float4 *alpha [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]){
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
float4 alpha_value = alpha[gid.z];
float4 output;
output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
outTexture.write(output, gid.xy, gid.z);
}
kernel void prelu_element(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
const device float4 *alpha [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]){
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size();
float4 alpha_value = alpha[alpha_to + gid.z];
float4 output;
output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
outTexture.write(output, gid.xy, gid.z);
}
kernel void prelu_other(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
const device float *alpha [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]){
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
float alpha_value = alpha[0];
float4 output;
output.x = input.x > 0 ? input.x : (alpha_value * input.x);
output.y = input.y > 0 ? input.y : (alpha_value * input.y);
output.z = input.z > 0 ? input.z : (alpha_value * input.z);
output.w = input.w > 0 ? input.w : (alpha_value * input.w);
outTexture.write(output, gid.xy, gid.z);
}
kernel void prelu_channel_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
const device half4 *alpha [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]){
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
half4 alpha_value = alpha[gid.z];
half4 output;
output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
outTexture.write(output, gid.xy, gid.z);
}
kernel void prelu_element_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
const device half4 *alpha [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]){
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size();
half4 alpha_value = alpha[alpha_to + gid.z];
half4 output;
output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
outTexture.write(output, gid.xy, gid.z);
}
kernel void prelu_other_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
const device half *alpha [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]){
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
half alpha_value = alpha[0];
half4 output;
output.x = input.x > 0 ? input.x : (alpha_value * input.x);
output.y = input.y > 0 ? input.y : (alpha_value * input.y);
output.z = input.z > 0 ? input.z : (alpha_value * input.z);
output.w = input.w > 0 ? input.w : (alpha_value * input.w);
outTexture.write(output, gid.xy, gid.z);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
using namespace metal;
struct PriorBoxMetalParam {
float offset;
float stepWidth;
float stepHeight;
float minSize;
float maxSize;
float imageWidth;
float imageHeight;
bool clip;
uint numPriors;
uint aspecRatiosSize;
uint minSizeSize;
uint maxSizeSize;
};
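// prior_box: generates SSD prior boxes around the center of each feature-map cell. Slices below
// aspecRatiosSize hold the aspect-ratio boxes derived from minSize, slices at or beyond it hold
// the sqrt(minSize * maxSize) box when maxSizeSize > 0, and the variance texture is filled with
// variances[0] for every prior.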
kernel void prior_box(texture2d_array<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outBoxTexture [[texture(1)]],
texture2d_array<float, access::write> varianceTexture [[texture(2)]],
const device float *aspect_ratios [[buffer(0)]],
constant PriorBoxMetalParam &param [[buffer(1)]],
const device float4 *variances [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outBoxTexture.get_width() ||
gid.y >= outBoxTexture.get_height() ||
gid.z >= outBoxTexture.get_array_size()) return;
float center_x = (gid.x + param.offset) * param.stepWidth;
float center_y = (gid.y + param.offset) * param.stepHeight;
float box_width, box_height;
if (gid.z < param.aspecRatiosSize) {
float ar = aspect_ratios[gid.z];
box_width = param.minSize * sqrt(ar) / 2;
box_height = param.minSize / sqrt(ar) / 2;
float4 box;
box.x = (center_x - box_width) / param.imageWidth;
box.y = (center_y - box_height) / param.imageHeight;
box.z = (center_x + box_width) / param.imageWidth;
box.w = (center_y + box_height) / param.imageHeight;
float4 res;
if (param.clip) {
res = fmin(fmax(box, 0.0), 1.0);
} else {
res = box;
}
outBoxTexture.write(res, gid.xy, gid.z);
} else if (gid.z >= param.aspecRatiosSize) {
if (param.maxSizeSize > 0) {
box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
float4 max_box;
max_box.x = (center_x - box_width) / param.imageWidth;
max_box.y = (center_y - box_height) / param.imageHeight;
max_box.z = (center_x + box_width) / param.imageWidth;
max_box.w = (center_y + box_height) / param.imageHeight;
float4 res;
if (param.clip) {
res = min(max(max_box, 0.0), 1.0);
} else {
res = max_box;
}
outBoxTexture.write(res, gid.xy, gid.z);
}
}
float4 variance = variances[0];
if (gid.z < param.numPriors) {
float4 variances_output;
variances_output.x = variance.x;
variances_output.y = variance.y;
variances_output.z = variance.z;
variances_output.w = variance.w;
varianceTexture.write(variances_output, gid.xy, gid.z);
}
}
kernel void prior_box_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outBoxTexture [[texture(1)]],
texture2d_array<half, access::write> varianceTexture [[texture(2)]],
const device half *aspect_ratios [[buffer(0)]],
constant PriorBoxMetalParam &param [[buffer(1)]],
const device float4 *variances [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outBoxTexture.get_width() ||
gid.y >= outBoxTexture.get_height() ||
gid.z >= outBoxTexture.get_array_size()) return;
float center_x = (gid.x + param.offset) * param.stepWidth;
float center_y = (gid.y + param.offset) * param.stepHeight;
float box_width, box_height;
if (gid.z < param.aspecRatiosSize) {
half ar = aspect_ratios[gid.z];
box_width = param.minSize * sqrt(ar) / 2;
box_height = param.minSize / sqrt(ar) / 2;
float4 box;
box.x = (center_x - box_width) / param.imageWidth;
box.y = (center_y - box_height) / param.imageHeight;
box.z = (center_x + box_width) / param.imageWidth;
box.w = (center_y + box_height) / param.imageHeight;
float4 res;
if (param.clip) {
res = fmin(fmax(box, 0.0), 1.0);
} else {
res = box;
}
outBoxTexture.write(half4(res), gid.xy, gid.z);
} else if (gid.z >= param.aspecRatiosSize) {
if (param.maxSizeSize > 0) {
box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
float4 max_box;
max_box.x = (center_x - box_width) / param.imageWidth;
max_box.y = (center_y - box_height) / param.imageHeight;
max_box.z = (center_x + box_width) / param.imageWidth;
max_box.w = (center_y + box_height) / param.imageHeight;
float4 res;
if (param.clip) {
res = min(max(max_box, 0.0), 1.0);
} else {
res = max_box;
}
outBoxTexture.write(half4(res), gid.xy, gid.z);
}
}
float4 variance = variances[0];
if (gid.z < param.numPriors) {
float4 variances_output;
variances_output.x = variance.x;
variances_output.y = variance.y;
variances_output.z = variance.z;
variances_output.w = variance.w;
varianceTexture.write(half4(variances_output), gid.xy, gid.z);
}
}
kernel void prior_box_MinMaxAspectRatiosOrder(texture2d_array<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outBoxTexture [[texture(1)]],
texture2d_array<float, access::write> varianceTexture [[texture(2)]],
const device float *aspect_ratios [[buffer(0)]],
constant PriorBoxMetalParam &param [[buffer(1)]],
const device float4 *variances [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outBoxTexture.get_width() ||
gid.y >= outBoxTexture.get_height() ||
gid.z >= outBoxTexture.get_array_size()) return;
float center_x = (gid.x + param.offset) * param.stepWidth;
float center_y = (gid.y + param.offset) * param.stepHeight;
float box_width, box_height;
if (gid.z == 0) {
box_width = box_height = param.minSize / 2;
float4 box;
box.x = (center_x - box_width) / param.imageWidth;
box.y = (center_y - box_height) / param.imageHeight;
box.z = (center_x + box_width) / param.imageWidth;
box.w = (center_y + box_height) / param.imageHeight;
float4 res;
if (param.clip) {
res = fmin(fmax(box, 0.0), 1.0);
} else {
res = box;
}
outBoxTexture.write(res, gid.xy, gid.z);
}
if (gid.z == 1 && param.maxSizeSize > 0) {
box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
float4 max_box;
max_box.x = (center_x - box_width) / param.imageWidth;
max_box.y = (center_y - box_height) / param.imageHeight;
max_box.z = (center_x + box_width) / param.imageWidth;
max_box.w = (center_y + box_height) / param.imageHeight;
float4 res;
if (param.clip) {
res = min(max(max_box, 0.0), 1.0);
} else {
res = max_box;
}
outBoxTexture.write(res, gid.xy, gid.z);
}
int aspect_to = 0;
if (param.maxSizeSize > 0) {
aspect_to = gid.z - 2;
} else {
aspect_to = gid.z - 1;
}
if (aspect_to >= 0 && aspect_to < int(param.aspecRatiosSize)) {
int skip = 0;
for (int i = 0; i < aspect_to + 1; ++i) {
if (fabs(aspect_ratios[i] - 1.) < 1e-6) {
skip += 1;
}
}
aspect_to += skip;
float ar = aspect_ratios[aspect_to];
box_width = param.minSize * sqrt(ar) / 2;
box_height = param.minSize / sqrt(ar) / 2;
float4 box;
box.x = (center_x - box_width) / param.imageWidth;
box.y = (center_y - box_height) / param.imageHeight;
box.z = (center_x + box_width) / param.imageWidth;
box.w = (center_y + box_height) / param.imageHeight;
float4 res;
if (param.clip) {
res = fmin(fmax(box, 0.0), 1.0);
} else {
res = box;
}
outBoxTexture.write(res, gid.xy, gid.z);
}
float4 variance = variances[0];
if (gid.z < param.numPriors) {
float4 variances_output;
variances_output.x = variance.x;
variances_output.y = variance.y;
variances_output.z = variance.z;
variances_output.w = variance.w;
varianceTexture.write(variances_output, gid.xy, gid.z);
}
}
kernel void prior_box_MinMaxAspectRatiosOrder_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outBoxTexture [[texture(1)]],
texture2d_array<half, access::write> varianceTexture [[texture(2)]],
const device half *aspect_ratios [[buffer(0)]],
constant PriorBoxMetalParam &param [[buffer(1)]],
const device float4 *variances [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outBoxTexture.get_width() ||
gid.y >= outBoxTexture.get_height() ||
gid.z >= outBoxTexture.get_array_size()) return;
float center_x = (gid.x + param.offset) * param.stepWidth;
float center_y = (gid.y + param.offset) * param.stepHeight;
float box_width, box_height;
if (gid.z == 0) {
box_width = box_height = param.minSize / 2;
float4 box;
box.x = (center_x - box_width) / param.imageWidth;
box.y = (center_y - box_height) / param.imageHeight;
box.z = (center_x + box_width) / param.imageWidth;
box.w = (center_y + box_height) / param.imageHeight;
float4 res;
if (param.clip) {
res = fmin(fmax(box, 0.0), 1.0);
} else {
res = box;
}
outBoxTexture.write(half4(res), gid.xy, gid.z);
}
if (gid.z == 1 && param.maxSizeSize > 0) {
box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
float4 max_box;
max_box.x = (center_x - box_width) / param.imageWidth;
max_box.y = (center_y - box_height) / param.imageHeight;
max_box.z = (center_x + box_width) / param.imageWidth;
max_box.w = (center_y + box_height) / param.imageHeight;
float4 res;
if (param.clip) {
res = min(max(max_box, 0.0), 1.0);
} else {
res = max_box;
}
outBoxTexture.write(half4(res), gid.xy, gid.z);
}
int aspect_to = 0;
if (param.maxSizeSize > 0) {
aspect_to = gid.z - 2;
} else {
aspect_to = gid.z - 1;
}
if (aspect_to > 0 && aspect_to < int(param.aspecRatiosSize) && fabs(aspect_ratios[aspect_to] - 1.) > 1e-6) {
float ar = aspect_ratios[aspect_to];
box_width = param.minSize * sqrt(ar) / 2;
box_height = param.minSize / sqrt(ar) / 2;
float4 box;
box.x = (center_x - box_width) / param.imageWidth;
box.y = (center_y - box_height) / param.imageHeight;
box.z = (center_x + box_width) / param.imageWidth;
box.w = (center_y + box_height) / param.imageHeight;
float4 res;
if (param.clip) {
res = fmin(fmax(box, 0.0), 1.0);
} else {
res = box;
}
outBoxTexture.write(half4(res), gid.xy, gid.z);
}
float4 variance = variances[0];
if (gid.z < param.numPriors) {
float4 variances_output;
variances_output.x = variance.x;
variances_output.y = variance.y;
variances_output.z = variance.z;
variances_output.w = variance.w;
varianceTexture.write(half4(variances_output), gid.xy, gid.z);
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
using namespace metal;
kernel void relu_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero);
const half4 input = inTexture.read(gid.xy, gid.z);
const float4 relu = fmax((float4)input, 0.0);
outTexture.write(half4(relu), gid.xy, gid.z);
}
kernel void relu(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero);
const float4 input = inTexture.read(gid.xy, gid.z);
const float4 relu = fmax((float4)input, 0.0);
outTexture.write(float4(relu), gid.xy, gid.z);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef P
#define CONCAT2(a, b) a ## b
#define CONCAT2_(a, b) a ## _ ## b
#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d
#define FUNC(f, r1, r2, p) CONCAT4_(f, r1, r2, p)
#define VECTOR(p, n) CONCAT2(p, n)
#define FUNC_R(f, r) CONCAT2_(f, r)
kernel void FUNC(reshape, RIN, ROUT, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
texture2d_array<P, access::write> outTexture [[texture(1)]],
constant ReshapeParam &rp [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, oabcd[4], ixyzn[4], iabcd[4];
ReshapeParam lrp = rp;
int oC = lrp.odim[lrp.otrans[3]];
int iC = lrp.idim[lrp.itrans[3]];
int count = lrp.odim[0] * lrp.odim[1] * lrp.odim[2] * lrp.odim[3];
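// For each of the 4 channel slots packed in this output texel: decode the texel coordinate into
// a logical (a, b, c, d) position, undo the output transpose, flatten it to a linear element
// index, then re-decode that index through the input dims and transpose to locate the source
// texel/channel to copy from.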
VECTOR(P, 4) r;
for (int n = 0; n < 4; n++) {
oxyzn[3] = n;
#if ROUT == 4
xyzn2abcd_4(oC, oxyzn, oabcd);
#else
FUNC_R(xyzn2abcd, ROUT)(oxyzn, oabcd);
#endif
int tabcd[4];
invtrans(lrp.otrans, oabcd, tabcd);
int index = abcd2index(lrp.odim, tabcd);
if (index < count) {
index2abcd(lrp.idim, index, tabcd);
trans(lrp.itrans, tabcd, iabcd);
#if RIN == 4
abcd2xyzn_4(iC, iabcd, ixyzn);
#else
FUNC_R(abcd2xyzn, RIN)(iabcd, ixyzn);
#endif
r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]];
} else {
r[n] = 0;
}
}
outTexture.write(r, gid.xy, gid.z);
}
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
struct ReshapeParam {
int32_t idim[4];
int32_t itrans[4];
int32_t odim[4];
int32_t otrans[4];
};
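// The include ladder below stamps out one kernel per (input rank, output rank, precision)
// combination, e.g. reshape_4_3_float, by redefining RIN / ROUT / P before each include of
// ReshapeKernel.inc.metal.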
#define P float
#define RIN 4
#define ROUT 4
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 3
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 2
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 1
#include "ReshapeKernel.inc.metal"
#undef ROUT
#undef RIN
#define RIN 3
#define ROUT 4
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 3
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 2
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 1
#include "ReshapeKernel.inc.metal"
#undef ROUT
#undef RIN
#define RIN 2
#define ROUT 4
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 3
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 2
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 1
#include "ReshapeKernel.inc.metal"
#undef ROUT
#undef RIN
#define RIN 1
#define ROUT 4
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 3
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 2
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 1
#include "ReshapeKernel.inc.metal"
#undef ROUT
#undef RIN
#undef P
#define P half
#define RIN 4
#define ROUT 4
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 3
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 2
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 1
#include "ReshapeKernel.inc.metal"
#undef ROUT
#undef RIN
#define RIN 3
#define ROUT 4
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 3
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 2
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 1
#include "ReshapeKernel.inc.metal"
#undef ROUT
#undef RIN
#define RIN 2
#define ROUT 4
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 3
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 2
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 1
#include "ReshapeKernel.inc.metal"
#undef ROUT
#undef RIN
#define RIN 1
#define ROUT 4
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 3
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 2
#include "ReshapeKernel.inc.metal"
#undef ROUT
#define ROUT 1
#include "ReshapeKernel.inc.metal"
#undef ROUT
#undef RIN
#undef P
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
using namespace metal;
struct resize_bilinear_param {
// int32_t out_h;
// int32_t out_w;
float ratio_h;
float ratio_w;
};
kernel void resize_bilinear(texture2d_array<float, access::read> input [[texture(0)]],
texture2d_array<float, access::write> output [[texture(2)]],
constant resize_bilinear_param & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
float4 r;
if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
r = input.read(gid.xy, gid.z);
} else {
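// Map the output pixel to fractional source coordinates, then blend the four neighbouring
// source texels with bilinear weights (w1lambda / h1lambda are the fractional parts,
// w2lambda / h2lambda their complements).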
float w = gid.x * pm.ratio_w;
float h = gid.y * pm.ratio_h;
uint w0 = w, h0 = h;
uint w1 = w0 + 1, h1 = h0 + 1;
float w1lambda = w - w0, h1lambda = h - h0;
float w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
if (w1 >= input.get_width()) w1 = w0;
if (h1 >= input.get_height()) h1 = h0;
float4 r0 = input.read(uint2(w0, h0), gid.z);
float4 r1 = input.read(uint2(w1, h0), gid.z);
float4 r2 = input.read(uint2(w0, h1), gid.z);
float4 r3 = input.read(uint2(w1, h1), gid.z);
r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);
}
output.write(r, gid.xy, gid.z);
}
kernel void resize_bilinear_half(texture2d_array<half, access::read> input [[texture(0)]],
texture2d_array<half, access::write> output [[texture(2)]],
constant resize_bilinear_param & pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
half4 r;
if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
r = input.read(gid.xy, gid.z);
} else {
half w = gid.x * pm.ratio_w;
half h = gid.y * pm.ratio_h;
uint w0 = w, h0 = h;
uint w1 = w0 + 1, h1 = h0 + 1;
half w1lambda = w - w0, h1lambda = h - h0;
half w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
if (w1 >= input.get_width()) w1 = w0;
if (h1 >= input.get_height()) h1 = h0;
half4 r0 = input.read(uint2(w0, h0), gid.z);
half4 r1 = input.read(uint2(w1, h0), gid.z);
half4 r2 = input.read(uint2(w0, h1), gid.z);
half4 r3 = input.read(uint2(w1, h1), gid.z);
r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);
}
output.write(r, gid.xy, gid.z);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
using namespace metal;
kernel void shape() {
}
kernel void shape_half() {
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef P
#define CONCAT2(a, b) a ## b
#define CONCAT2_(a, b) a ## _ ## b
#define FUNC(f, p) CONCAT2_(f, p)
#define VECTOR(p, n) CONCAT2(p, n)
kernel void FUNC(softmax, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
texture2d_array<P, access::write> outTexture [[texture(1)]],
constant SoftmaxParam &sp [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
// int zsize = inTexture.get_array_size();
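// Numerically stable softmax along the K elements of this row: find the row maximum,
// accumulate sum(exp(x - max)), then normalize the current texel.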
P maxv = inTexture.read(uint2(0, gid.y), 0)[0];
int group = sp.K / 4;
int remain = sp.K % 4;
for (int x = 0; x < group; x++) {
VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0);
maxv = max(maxv, max(r[0], max(r[1], max(r[2], r[3]))));
}
if (remain > 0) {
VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0);
for (int i = 0; i < remain; i++) {
maxv = max(maxv, r[i]);
}
}
VECTOR(P, 4) rsum = {0, 0, 0, 0};
for (int x = 0; x < group; x++) {
VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0);
rsum += exp(r - maxv);
}
P sum = rsum[0] + rsum[1] + rsum[2] + rsum[3];
if (remain > 0) {
VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0);
for (int i = 0; i < remain; i++) {
sum += exp(r[i] - maxv);
}
}
VECTOR(P, 4) rr = inTexture.read(gid.xy, gid.z);
rr = exp(rr - maxv) / sum;
outTexture.write(rr, gid.xy, gid.z);
}
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
using namespace metal;
struct SoftmaxParam {
int N;
int K;
};
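// Instantiate the templated kernel for both precisions: softmax_float and softmax_half.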
#define P float
#include "Softmax.inc.metal"
#undef P
#define P half
#include "Softmax.inc.metal"
#undef P
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef P
#define CONCAT2(a, b) a ## b
#define CONCAT2_(a, b) a ## _ ## b
#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d
#define CONCAT5_(a, b, c, d, e) a ## _ ## b ## _ ## c ## _ ## d ## _ ## e
#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p)
#define VECTOR(p, n) CONCAT2(p, n)
#define FUNC_R(f, r) CONCAT2_(f, r)
#if V == VX
#define VV x
#elif V == VY
#define VV y
#elif V == VZ
#define VV z
#else
#define VV normal
#endif
#if V == VY
kernel void FUNC(split, R, N, VV, P)(texture2d_array<P, access::read> input [[texture(0)]],
texture2d_array<P, access::write> out1 [[texture(1)]],
texture2d_array<P, access::write> out2 [[texture(2)]],
#if N >= 3
texture2d_array<P, access::write> out3 [[texture(3)]],
#endif // N >= 3
#if N >= 4
texture2d_array<P, access::write> out4 [[texture(4)]],
#endif // N >= 4
constant SplitParam &sp [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
VECTOR(P, 4) r = input.read(gid.xy, gid.z);
int y = gid.y - sp.offset;
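// Walk the concatenated y-ranges: subtract each preceding output's extent (sp.vdim[i]) until
// the texel falls inside one output's range, then write it to that output.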
if (y < sp.vdim[0]) {
out1.write(r, gid.xy, gid.z);
return;
}
y -= sp.vdim[0];
if (y < sp.vdim[1]) {
out2.write(r, uint2(gid.x, y), gid.z);
return;
}
#if N >= 3
y -= sp.vdim[1];
if (y < sp.vdim[2]) {
out3.write(r, uint2(gid.x, y), gid.z);
return;
}
#endif // N >= 3
#if N >= 4
y -= sp.vdim[2];
if (y < sp.vdim[3]) {
out4.write(r, uint2(gid.x, y), gid.z);
return;
}
#endif // N >= 4
}
#endif // V == VY
#if V == VX
kernel void FUNC(split, R, N, VV, P)(texture2d_array<P, access::read> input [[texture(0)]],
texture2d_array<P, access::write> out1 [[texture(1)]],
texture2d_array<P, access::write> out2 [[texture(2)]],
#if N >= 3
texture2d_array<P, access::write> out3 [[texture(3)]],
#endif // N >= 3
#if N >= 4
texture2d_array<P, access::write> out4 [[texture(4)]],
#endif // N >= 4
constant SplitParam &sp [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
VECTOR(P, 4) r = input.read(gid.xy, gid.z);
int x = gid.x;
if (x < sp.vdim[0]) {
out1.write(r, gid.xy, gid.z);
return;
}
x -= sp.vdim[0];
if (x < sp.vdim[1]) {
out2.write(r, uint2(x, gid.y), gid.z);
return;
}
#if N >= 3
x -= sp.vdim[1];
if (x < sp.vdim[2]) {
out3.write(r, uint2(x, gid.y), gid.z);
return;
}
#endif // N >= 3
#if N >= 4
x -= sp.vdim[2];
if (x < sp.vdim[3]) {
out4.write(r, uint2(x, gid.y), gid.z);
return;
}
#endif // N >= 4
}
#endif // V == VX
#undef VV
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
struct SplitParam {
int32_t idim[4];
int32_t axis;
int32_t offset;
int32_t trans[4];
int32_t vdim[4];
};
#define VNORMAL 1
#define VX 2
#define VY 3
#define VZ 4
// only support split_{2, 3, 4}_{2, 3, 4}_y_{float, half}
// only support split_{3, 4}_{2, 3, 4}_x_{float, half}
//// ssd-ar: (R=3, N=2, V=y)
#define V VY
#define R 3
#define N 2
#define P float
#include "Split.inc.metal"
#undef P
#define P half
#include "Split.inc.metal"
#undef P
#undef N
#undef R
#undef V
//// ssd-ar: (R=2, N=2, V=y)
#define V VY
#define R 2
#define N 2
#define P float
#include "Split.inc.metal"
#undef P
#define P half
#include "Split.inc.metal"
#undef P
#undef N
#undef R
#undef V
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef P
#define CONCAT2(a, b) a ## b
#define CONCAT2_(a, b) a ## _ ## b
#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
#define FUNC(f, r, p) CONCAT3_(f, r, p)
#define VECTOR(p, n) CONCAT2(p, n)
kernel void FUNC(transpose, R, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
texture2d_array<P, access::write> outTexture [[texture(1)]],
constant TransposeParam &pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
VECTOR(P, 4) r;
int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0};
int iabcd[4], oabcd[4], ixyzn[4];
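// For each of the 4 channel slots in this output texel: decode the output coordinate into a
// logical (a, b, c, d) position, permute it with pm.axis to get the source position, and
// re-encode that position into the input texel/channel to read from.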
for (int n = 0; n < 4; n++) {
oxyzn[3] = n;
#if R == 4
xyzn2abcd_4(pm.oC, oxyzn, oabcd);
#endif // R == 4
#if R == 3
xyzn2abcd_3(oxyzn, oabcd);
#endif // R == 3
#if R == 2
xyzn2abcd_2(oxyzn, oabcd);
#endif // R == 2
iabcd[pm.axis[0]] = oabcd[0];
iabcd[pm.axis[1]] = oabcd[1];
iabcd[pm.axis[2]] = oabcd[2];
iabcd[pm.axis[3]] = oabcd[3];
#if R == 4
abcd2xyzn_4(pm.iC, iabcd, ixyzn);
#endif // R == 4
#if R == 3
abcd2xyzn_3(iabcd, ixyzn);
#endif // R == 3
#if R == 2
abcd2xyzn_2(iabcd, ixyzn);
#endif // R == 2
r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]];
}
outTexture.write(r, gid.xy, gid.z);
}
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
using namespace metal;
struct TransposeParam {
int iC;
int oC;
int axis[4];
};
kernel void transpose_copy_float(texture2d_array<float, access::read> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]],
constant TransposeParam &pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z);
}
kernel void transpose_copy_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant TransposeParam &pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z);
}
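// The includes below instantiate transpose_<R>_<float|half> kernels for tensor ranks 4, 3 and 2.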
#define R 4
#define P float
#include "TransposeKernel.inc.metal"
#undef P
#define P half
#include "TransposeKernel.inc.metal"
#undef P
#undef R
#define R 3
#define P float
#include "TransposeKernel.inc.metal"
#undef P
#define P half
#include "TransposeKernel.inc.metal"
#undef P
#undef R
#define R 2
#define P float
#include "TransposeKernel.inc.metal"
#undef P
#define P half
#include "TransposeKernel.inc.metal"
#undef P
#undef R
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class MulticlassNMSParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
scores = try MulticlassNMSParam.getFirstTensor(key: "Scores", map: opDesc.inputs, from: inScope)
bboxes = try MulticlassNMSParam.getFirstTensor(key: "BBoxes", map: opDesc.inputs, from: inScope)
output = try MulticlassNMSParam.outputOut(outputs: opDesc.outputs, from: inScope)
middleOutput = FetchHolder.init(inCapacity: scores.tensorDim.numel(), inDim: scores.tensorDim.dims)
bboxOutput = FetchHolder.init(inCapacity: bboxes.tensorDim.numel(), inDim: bboxes.tensorDim.dims)
} catch let error {
throw error
}
}
var bboxOutput: FetchHolder
var middleOutput: FetchHolder
let scores: Texture<P>
let bboxes: Texture<P>
var output: Texture<P>
}
class MulticlassNMSOp<P: PrecisionType>: Operator<MulticlassNMSKernel<P>, MulticlassNMSParam<P>>, Runable, Creator, InferShaperable{
func inputVariant() -> [String : [Variant]] {
return ["Scores" : [para.middleOutput], "BBoxes" : [para.bboxOutput]]
}
func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let _ {
fatalError()
}
}
func inferShape() {
// para.output.dim = para.input.dim
}
typealias OpType = MulticlassNMSOp<P>
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
}
func delogOutput() {
print(" nms - output: ")
print(para.bboxes.metalTexture.float32Array().strideArray())
}
}
......@@ -15,54 +15,60 @@
import Foundation
class PoolParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try PoolParam.inputX(inputs: opDesc.inputs, from: inScope)
output = try PoolParam.outputOut(outputs: opDesc.outputs, from: inScope)
poolType = try PoolParam.getAttr(key: "pooling_type", attrs: opDesc.attrs)
ksize = try PoolParam.getAttr(key: "ksize", attrs: opDesc.attrs)
stride = try PoolParam.getAttr(key: "strides", attrs: opDesc.attrs)
padding = try PoolParam.getAttr(key: "paddings", attrs: opDesc.attrs)
ceilMode = try PoolParam.getAttr(key: "ceil_mode", attrs: opDesc.attrs)
globalPooling = try PoolParam.getAttr(key: "global_pooling", attrs: opDesc.attrs)
assert(input.transpose == [0, 2, 3, 1])
} catch let error {
throw error
}
// let buffer = input.metalTexture.buffer.contents().assumingMemoryBound(to: P.self)
}
let input: Texture<P>
var output: Texture<P>
var ksize: [Int32]
var stride: [Int32]
var padding: [Int32]
var poolType: String
var ceilMode: Bool
var globalPooling: Bool
}
class PoolOp<P: PrecisionType>: Operator<PoolKernel<P>, PoolParam<P>>, Runable, Creator, InferShaperable{
typealias OpType = PoolOp<P>
func inferShape() {
// para.output.dim = para.input.dim
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
// print("pool2d delog")
// let _: P? = para.input.metalTexture.logDesc(header: "pool2d input: ", stridable: true)
// print(para.ksize)
// print(para.stride)
// print(para.padding)
// print(para.poolType)
// let _: P? = para.output.metalTexture.logDesc(header: "pool2d output: ", stridable: true)
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class PreluParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try PreluParam.inputX(inputs: opDesc.inputs, from: inScope)
output = try PreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
alpha = try PreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
mode = try PreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
} catch let error {
throw error
}
}
let mode: String
let alpha: Tensor<P>
let input: Texture<P>
var output: Texture<P>
}
class PreluOp<P: PrecisionType>: Operator<PreluKernel<P>, PreluParam<P>>, Runable, Creator, InferShaperable{
typealias OpType = PreluOp<P>
func inferShape() {
// para.output.dim = para.input.dim
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) input: ")
print(para.input.metalTexture.toTensor(dim: (n: para.input.padToFourDim[0], c: para.input.padToFourDim[1], h: para.input.padToFourDim[2], w: para.input.padToFourDim[3])).strideArray())
print(" \(type) Alpha: ")
let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false)
print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
}
// print("softmax delog")
// let _: P? = para.input.metalTexture.logDesc(header: "softmax input: ", stridable: false)
// let _: P? = para.output.metalTexture.logDesc(header: "softmax output: ", stridable: false)
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class PriorBoxParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
min_max_aspect_ratios_order = try PriorBoxParam.getAttr(key: "min_max_aspect_ratios_order", attrs: opDesc.attrs)
} catch _ {
}
do {
input = try PriorBoxParam.input(inputs: opDesc.inputs, from: inScope)
output = try PriorBoxParam.outputBoxes(outputs: opDesc.outputs, from: inScope)
inputImage = try PriorBoxParam.inputImage(inputs: opDesc.inputs, from: inScope)
outputVariances = try PriorBoxParam.outputVariances(outputs: opDesc.outputs, from: inScope)
minSizes = try PriorBoxParam.getAttr(key: "min_sizes", attrs: opDesc.attrs)
maxSizes = try PriorBoxParam.getAttr(key: "max_sizes", attrs: opDesc.attrs)
aspectRatios = try PriorBoxParam.getAttr(key: "aspect_ratios", attrs: opDesc.attrs)
variances = try PriorBoxParam.getAttr(key: "variances", attrs: opDesc.attrs)
flip = try PriorBoxParam.getAttr(key: "flip", attrs: opDesc.attrs)
clip = try PriorBoxParam.getAttr(key: "clip", attrs: opDesc.attrs)
stepW = try PriorBoxParam.getAttr(key: "step_w", attrs: opDesc.attrs)
stepH = try PriorBoxParam.getAttr(key: "step_h", attrs: opDesc.attrs)
offset = try PriorBoxParam.getAttr(key: "offset", attrs: opDesc.attrs)
} catch let error {
throw error
}
}
var min_max_aspect_ratios_order: Bool = false
let minSizes: [Float32]
let maxSizes: [Float32]
let aspectRatios: [Float32]
var newAspectRatios: MTLBuffer?
let variances: [Float32]
let flip: Bool
let clip: Bool
var stepW: Float32
var stepH: Float32
let offset: Float32
let input: Texture<P>
let inputImage: Texture<P>
var output: Texture<P>
let outputVariances: Texture<P>
}
class PriorBoxOp<P: PrecisionType>: Operator<PriorBoxKernel<P>, PriorBoxParam<P>>, Runable, Creator, InferShaperable{
typealias OpType = PriorBoxOp<P>
func inferShape() {
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
// output
// let outputArray = para.output.metalTexture.float32Array()
// print(outputArray.strideArray())
// let device = para.input.metalTexture!.device
// let boxes:[Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: [2,0,1,3])
// let variances:[Float32] = device.texture2tensor(texture: para.outputVariances.metalTexture!, dim: para.outputVariances.tensorDim.dims, transpose: [2,0,1,3])
// print("boxes: ")
// print(boxes.strideArray())
// print("variances: ")
// print(variances.strideArray())
let box = para.output.metalTexture.realNHWC(dim: (para.output.dim[0], para.output.dim[1], para.output.dim[2], para.output.dim[3]))
print(" dim: \(para.output.dim)")
print(box.strideArray())
// print((0..<box.count).map { (index: $0, value: box[$0])})
// print(para.output.realNHWC().strideArray())
// let padToFourDim = para.output.padToFourDim
// if para.output.transpose == [0, 1, 2, 3] {
// let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]), texturePrecision: computePrecision)
// print(outputArray.strideArray())
// } else if para.output.transpose == [0, 2, 3, 1] {
// print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3]), texturePrecision: computePrecision).strideArray())
// } else {
// print(" not implement")
// }
// writeToLibrary(fileName: "box_out", array: outputArray)
// output variance
// let outputVarianceArray = para.outputVariances.metalTexture.floatArray { (o: Float32) -> Float32 in
// return o
// }
//
// print(" output variance: \(outputVarianceArray)")
// writeToLibrary(fileName: "variance_out", array: outputVarianceArray)
}
}
///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License. */
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class ReluParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try ReluParam.inputX(inputs: opDesc.inputs, from: inScope)
output = try ReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
} catch let error {
throw error
}
}
let input: Texture<P>
var output: Texture<P>
}
class ReluOp<P: PrecisionType>: Operator<ReluKernel<P>, ReluParam<P>>, Runable, Creator, InferShaperable{
typealias OpType = ReluOp<P>
func inferShape() {
para.output.dim = para.input.dim
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
let device = para.output.metalTexture!.device
let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
print(outputArray.strideArray())
}
}
......
......@@ -15,36 +15,63 @@
import Foundation
class ReshapeParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try ReshapeParam.inputX(inputs: opDesc.inputs, from: inScope)
output = try ReshapeParam.outputOut(outputs: opDesc.outputs, from: inScope)
shape = try ReshapeParam.getAttr(key: "shape", attrs: opDesc.attrs)
// Resolve a single -1 entry in the target shape from the remaining product of dims,
// then pad the resolved shape to four dimensions.
var s: [Int] = shape.map { Int($0) }
var di = -1
var ml = 1
for i in 0..<s.count {
if s[i] == -1 {
di = i
continue
}
ml *= s[i]
}
if di >= 0 {
s[di] = input.dim.numel() / ml
}
output.tensorDim = Dim.init(inDim: s)
var dim: [Int] = [1, 1, 1, 1]
for i in 0..<s.count {
dim[4-s.count+i] = s[i]
}
output.padToFourDim = Dim.init(inDim: dim)
output.dim = output.padToFourDim
} catch let error {
throw error
}
}
let input: Texture<P>
let shape: [Int32]
var output: Texture<P>
}
class ReshapeOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>, Runable, Creator, InferShaperable{
typealias OpType = ReshapeOp<P>
func inferShape() {
// para.output.dim = para.input.dim
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print("reshape delog")
let device = para.output.metalTexture!.device
let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
print(outputArray.strideArray())
// print(outputArray)
}
}
///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License. */
import Foundation
class ResizeBilinearParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try ResizeBilinearParam.inputX(inputs: opDesc.inputs, from: inScope)
// if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) {
// fatalError()
// }
output = try ResizeBilinearParam.outputOut(outputs: opDesc.outputs, from: inScope)
out_h = try ResizeBilinearParam.getAttr(key: "out_h", attrs: opDesc.attrs)
out_w = try ResizeBilinearParam.getAttr(key: "out_w", attrs: opDesc.attrs)
} catch let error {
throw error
}
}
let input: Texture<P>
var output: Texture<P>
let out_h: Int32
let out_w: Int32
}
class ResizeBilinearOp<P: PrecisionType>: Operator<ResizeBilinearKernel<P>, ResizeBilinearParam<P>>, Runable, Creator, InferShaperable{
typealias OpType = ResizeBilinearOp<P>
func inferShape() {
// para.output.dim = para.input.dim
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class ShapeParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try ShapeParam.input(inputs: opDesc.inputs, from: inScope)
output = try ShapeParam.outputOut(outputs: opDesc.outputs, from: inScope)
} catch let error {
throw error
}
}
var output: Texture<P>
let input: Texture<P>
}
class ShapeOp<P: PrecisionType>: Operator<ShapeKernel<P>, ShapeParam<P>>, Runable, Creator, InferShaperable{
typealias OpType = ShapeOp<P>
func inferShape() {
// para.output.dim = para.input.dim
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
}
}
......@@ -15,36 +15,48 @@
import Foundation
class SoftmaxParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try SoftmaxParam.inputX(inputs: opDesc.inputs, from: inScope)
output = try SoftmaxParam.outputOut(outputs: opDesc.outputs, from: inScope)
assert(input.tensorDim.dims.count == 2)
assert(input.transpose == [0, 1, 2, 3])
output.dim = input.dim
output.tensorDim = input.tensorDim
output.padToFourDim = input.padToFourDim
} catch let error {
throw error
}
}
let input: Texture<P>
var output: Texture<P>
}
class SoftmaxOp<P: PrecisionType>: Operator<SoftmaxKernel<P>, SoftmaxParam<P>>, Runable, Creator, InferShaperable{
typealias OpType = SoftmaxOp<P>
func inferShape() {
// para.output.dim = para.input.dim
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print("softmax delog")
print(para.input)
print(para.output)
let padToFourDim = para.output.padToFourDim
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(outputArray.strideArray())
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class SplitParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try SplitParam.inputX(inputs: opDesc.inputs, from: inScope)
output = Texture<P>.init(device: input.metalTexture!.device, inDim: input.dim)
axis = try SplitParam.getAttr(key: "axis", attrs: opDesc.attrs)
sections = try SplitParam.getAttr(key: "sections", attrs: opDesc.attrs)
if axis < 0 {
axis = input.tensorDim.cout() + axis
}
guard let outlist = opDesc.outputs["Out"] else {
fatalError()
}
for out in outlist {
guard let variant = inScope[out], let v = variant as? Texture<P> else {
fatalError()
}
outputList.append(v)
sections.append(Int32(v.tensorDim.dims[axis]))
}
} catch let error {
throw error
}
}
var axis: Int
let input: Texture<P>
var output: Texture<P>
var outputList: [Texture<P>] = []
var sections: [Int32] = []
}
class SplitOp<P: PrecisionType>: Operator<SplitKernel<P>, SplitParam<P>>, Runable, Creator, InferShaperable{
typealias OpType = SplitOp<P>
func inferShape() {
// para.output.dim = para.input.dim
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
let device = para.input.metalTexture!.device
for out in para.outputList {
let arr: [Float32] = device.texture2tensor(texture: out.metalTexture, dim: out.tensorDim.dims, transpose: out.transpose)
print(arr.strideArray())
}
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
class TransposeParam<P: PrecisionType>: OpParam {
typealias ParamPrecisionType = P
required init(opDesc: OpDesc, inScope: Scope) throws {
do {
input = try TransposeParam.inputX(inputs: opDesc.inputs, from: inScope)
output = try TransposeParam.outputOut(outputs: opDesc.outputs, from: inScope)
axis = try TransposeParam.getAttr(key: "axis", attrs: opDesc.attrs)
} catch let error {
throw error
}
}
let input: Texture<P>
var output: Texture<P>
let axis: [Int32]
}
class TransposeOp<P: PrecisionType>: Operator<TransposeKernel<P>, TransposeParam<P>>, Runable, Creator, InferShaperable{
typealias OpType = TransposeOp<P>
func inferShape() {
//para.output.dim = para.input.dim
}
func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
do {
try kernel.compute(commandBuffer: buffer, param: para)
} catch let error {
throw error
}
}
func delogOutput() {
print(" \(type) output: ")
let device = para.output.metalTexture!.device
let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
print(outputArray.strideArray())
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Metal
import MetalKit
import Foundation
@objc public enum Platform: Int{
case CPU, GPU
}
class ScaleKernel: CusomKernel {
init(device: MTLDevice, shape: Shape) {
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "scale", outputDim: shape, usePaddleMobileLib: false)
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "scale_half", outputDim: shape, usePaddleMobileLib: false)
} else {
fatalError(" unsupport ")
}
}
}
public class Runner: NSObject {
var program: Program?
var executor: Executor<Float32>?
var queue: MTLCommandQueue?
var textureLoader: MTKTextureLoader?
public let net: Net
let device: MTLDevice?
let platform: Platform
var cpuPaddleMobile: PaddleMobileCPU?
let numel: Int
let meansNumber: [NSNumber]
// dims num nchw
let dimsNum: [NSNumber]
/**
* inNet: the network to run
* commandQueue: must be passed in when using the GPU
* inPlatform: the platform to run on, GPU or CPU
*/
@objc public init(inNet: Net, commandQueue: MTLCommandQueue?, inPlatform: Platform) {
net = inNet
queue = commandQueue
device = queue?.device
platform = inPlatform
if let inDevice = device {
textureLoader = MTKTextureLoader.init(device: inDevice)
}
if platform == .CPU {
cpuPaddleMobile = PaddleMobileCPU.init()
}
numel = net.dim.n * net.dim.c * net.dim.h * net.dim.w
meansNumber = net.means.map { NSNumber.init(value: $0) }
dimsNum = [NSNumber.init(value: net.dim.n),
NSNumber.init(value: net.dim.c),
NSNumber.init(value: net.dim.h),
NSNumber.init(value: net.dim.w)]
}
/**
* Load the model; a return value of true means prediction can proceed
*/
@objc public func load() -> Bool {
if platform == .GPU {
guard let inDevice = device, let inQueue = queue else {
print(" paddle mobile gpu load error, need MTLCommandQueue")
return false
}
let loader = Loader<Float32>.init()
do {
program = try loader.load(device: inDevice, paramPointer: net.paramPointer!, paramSize: net.paramSize,modePointer:net.modelPointer!,modelSize:net.modelSize)
// program = try loader.load(device: inDevice, modelPath: net.modelPath, paraPath: net.paramPath)
net.updateProgram(program: program!)
executor = try Executor<Float32>.init(inDevice: inDevice, inQueue: inQueue, inProgram: program!)
} catch let error {
print(error)
return false
}
} else {
return cpuPaddleMobile?.load(net.modelPath, andWeightsPath: net.paramPath) ?? false
}
return true
}
@objc public func predict(inputPointer: UnsafeMutablePointer<Float32>, completion: @escaping ( _ success: Bool, _ result: PaddleMobileCPUResult?) -> Void) {
guard let res = cpuPaddleMobile?.predictInput(inputPointer, dim: dimsNum) else {
completion(false, nil)
return
}
completion(true, res)
}
/**
* GPU version of predict
* texture: the texture to run prediction on; it must already be preprocessed
* ( _ success: Bool, _ time: TimeInterval, _ resultArray: [Float32]) -> Void : completion closure; the three parameters are: whether prediction succeeded, the elapsed time, and the result array
*/
@objc public func predict(texture: MTLTexture, completion: @escaping ( _ success: Bool, _ result: ResultHolder?) -> Void) {
do {
try self.executor?.predict(input: texture, dim: [self.net.dim.n, self.net.dim.h, self.net.dim.w, self.net.dim.c], completionHandle: { [weak self] (res) in
guard let SSelf = self else {
fatalError( " self nil " )
}
let result = SSelf.net.fetchResult(paddleMobileRes: res)
completion(true, result)
}, preProcessKernle: self.net.preprocessKernel, except: self.net.except)
} catch let error {
print(error)
completion(false, nil)
return
}
}
/**
* predict shared by the CPU and GPU paths
* cgImage: the image to run prediction on
* ( _ success: Bool, _ time: TimeInterval, _ resultArray: [Float32]) -> Void : completion closure; the three parameters are: whether prediction succeeded, the elapsed time, and the result array
*/
// @objc public func predict(cgImage: CGImage, completion: @escaping ( _ success: Bool, _ resultArray: [Float32]) -> Void) {
// if platform == .GPU {
// getTexture(image: cgImage) { [weak self] (texture) in
// guard let SSelf = self else {
// fatalError( "" )
// }
// SSelf.predict(texture: texture, completion: completion)
// }
// } else if platform == .CPU {
// let input = preproccess(image: cgImage)
// predict(inputPointer: input, completion: completion)
// input.deinitialize(count: numel)
// input.deallocate()
// }
// }
/*
* Release memory; after calling this the runner can no longer be used until load is called again
*/
@objc public func clear() {
if platform == .GPU {
executor?.clear()
executor = nil
program = nil
} else if platform == .CPU {
cpuPaddleMobile?.clear()
}
}
@objc public func preproccess(image: CGImage) -> UnsafeMutablePointer<Float> {
let output = UnsafeMutablePointer<Float>.allocate(capacity: numel)
let means = net.means.map { NSNumber.init(value: $0) }
let dims = [NSNumber.init(value: net.dim.n),
NSNumber.init(value: net.dim.c),
NSNumber.init(value: net.dim.h),
NSNumber.init(value: net.dim.w)]
cpuPaddleMobile?.preprocess(image, output: output, means: means, scale: net.scale, dim: dims)
return output
}
/*
* Create a texture from the image and preprocess it; used for GPU prediction
*/
@objc public func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) {
let texture = try? textureLoader?.newTexture(cgImage: image, options: [:]) ?! " texture loader error"
scaleTexture(input: texture!, complete: getTexture)
}
public func scaleTexture(input: MTLTexture , complete: @escaping (MTLTexture) -> Void) {
guard let inQueue = queue, let inDevice = device else {
fatalError( " queue or devcie nil " )
}
guard let buffer = inQueue.makeCommandBuffer() else {
fatalError( " make buffer error" )
}
let scaleKernel = ScaleKernel.init(device: inDevice, shape: CusomKernel.Shape.init(inWidth: net.dim.w, inHeight: net.dim.h, inChannel: 3))
do {
try scaleKernel.compute(inputTexuture: input, commandBuffer: buffer)
} catch let error {
print(error)
fatalError()
}
buffer.addCompletedHandler { (buffer) in
complete(scaleKernel.outputTexture)
}
buffer.commit()
}
}
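// ---------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original sources): minimal GPU
// usage of the Runner API defined above. It assumes a concrete Net subclass
// (e.g. Genet) has already been constructed elsewhere, relies on the Metal /
// MetalKit / Foundation imports already present in this file, and only uses the
// init / load / getTexture / predict methods visible above.
func runnerGPUExample(net: Net, queue: MTLCommandQueue, image: CGImage) {
  let runner = Runner(inNet: net, commandQueue: queue, inPlatform: .GPU)
  guard runner.load() else {
    print("paddle-mobile GPU load failed")
    return
  }
  // getTexture converts the CGImage into a scaled, preprocessed MTLTexture.
  runner.getTexture(image: image) { texture in
    runner.predict(texture: texture) { success, result in
      print("predict success: \(success), result: \(String(describing: result))")
    }
  }
}
// ---------------------------------------------------------------------------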
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#import <Metal/Metal.h>
#import <Foundation/Foundation.h>
typedef enum : NSUInteger {
MobileNetType,
MobileNetSSDType,
GenetType,
} NetType;
@interface PaddleMobileGPUResult: NSObject
@property (assign, nonatomic) float *output;
@property (assign, nonatomic) int outputSize;
-(void)releaseOutput;
@end
@interface ModelConfig: NSObject
/*
* Mean values used for preprocessing (three values)
*/
@property (strong, nonatomic) NSArray<NSNumber *> *means;
/*
* Scale value used for preprocessing
*/
@property (assign, nonatomic) float scale;
/*
* Output dimension information [n c h w]
*/
@property (strong, nonatomic) NSArray<NSNumber *> *dims;
/*
* Memory address of the model parameters
*/
@property (assign, nonatomic) void *paramPointer;
/*
* Size of the model parameters in memory (kb)
*/
@property (assign, nonatomic) int paramSize;
/*
* Memory address of the model
*/
@property (assign, nonatomic) void *modelPointer;
/*
* Size of the model in memory (kb)
*/
@property (assign, nonatomic) int modelSize;
@end
@interface PaddleMobileGPU: NSObject
/*
* Initializer
*/
-(instancetype)initWithCommandQueue:(id<MTLCommandQueue>)queue net:(NetType)netType modelConfig:(ModelConfig *)config;
/*
* paramPointer memory address of the model parameters
* paramSize size of the model parameters in memory (kb)
* modelPointer memory address of the model
* modelSize size of the model in memory (kb)
*/
-(BOOL)load;
/*
* texture: the texture converted from the image to run prediction on
* completion: callback invoked when prediction finishes
*/
-(void)predict:(id<MTLTexture>)texture withCompletion:(void (^)(BOOL, NSArray<NSNumber *> *))completion;
/*
* texture: the texture converted from the image to run prediction on
* completion: callback invoked when prediction finishes
*/
-(void)predict:(id<MTLTexture>)texture withResultCompletion:(void (^)(BOOL, PaddleMobileGPUResult *))completion;
/*
* Release memory
*/
-(void)clear;
@end
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#import <Foundation/Foundation.h>
#import "PaddleMobileGPU.h"
#import "paddle_mobile.h"
#import <paddle_mobile/paddle_mobile-Swift.h>
@implementation ModelConfig
@end
@interface PaddleMobileGPUResult ()
@property (strong, nonatomic) ResultHolder *resultHolder;
- (void)setOutputResult:(ResultHolder *)resultHolder;
@end
@implementation PaddleMobileGPUResult
- (void)setOutputResult:(ResultHolder *)resultHolder {
self.resultHolder = resultHolder;
self.output = resultHolder.result;
self.outputSize = resultHolder.capacity;
}
-(void)releaseOutput {
[self.resultHolder releasePointer];
}
@end
@interface PaddleMobileGPU ()
{
Runner *runner;
}
@end
@implementation PaddleMobileGPU
-(instancetype)initWithCommandQueue:(id<MTLCommandQueue>)queue net:(NetType)netType modelConfig:(ModelConfig *)config {
self = [super init];
if (self) {
Net *net = nil;
if (netType == GenetType) {
net = [[Genet alloc] initWithDevice:queue.device paramPointer:config.paramPointer paramSize:config.paramSize modePointer:config.modelPointer modelSize:config.modelSize];
} else if (netType == MobileNetSSDType) {
net = [[MobileNet_ssd_AR alloc] initWithDevice:queue.device paramPointer:config.paramPointer paramSize:config.paramSize modePointer:config.modelPointer modelSize:config.modelSize];
} else if (netType == MobileNetType) {
}
runner = [[Runner alloc] initInNet:net commandQueue:queue inPlatform:PlatformGPU];
}
return self;
}
-(BOOL)load {
return [runner load];
}
-(void)predict:(id<MTLTexture>)texture withCompletion:(void (^)(BOOL, NSArray<NSNumber *> *))completion {
[runner predictWithTexture:texture completion:^(BOOL success, ResultHolder * _Nullable result) {
NSMutableArray<NSNumber *> *resultArray = [NSMutableArray arrayWithCapacity:result.capacity];
for (int i = 0; i < result.capacity; ++i) {
[resultArray addObject:[NSNumber numberWithFloat:result.result[i]]];
}
completion(success, resultArray);
[result releasePointer];
}];
}
-(void)predict:(id<MTLTexture>)texture withResultCompletion:(void (^)(BOOL, PaddleMobileGPUResult *))completion {
[runner predictWithTexture:texture completion:^(BOOL success, ResultHolder * _Nullable result) {
PaddleMobileGPUResult *gpuResult = [[PaddleMobileGPUResult alloc] init];
[gpuResult setOutputResult:result];
completion(success, gpuResult);
}];
}
-(void)clear {
[runner clear];
}
@end
......@@ -14,7 +14,7 @@
import Foundation
struct BlockDesc {
class BlockDesc {
let index: Int
let parentIndex: Int
let vars: [VarDesc]
......@@ -48,8 +48,10 @@ extension BlockDesc: CustomStringConvertible, CustomDebugStringConvertible {
var description: String {
var str = ""
for op in ops {
str += op.description
for i in 0..<ops.count {
str += " op \(i): "
let op = ops[i]
str += op.description
}
for varDesc in vars {
......
......@@ -14,68 +14,68 @@
import Foundation
struct OpDesc {
let inputs: [String : [String]]
var paraInputs: [String : [String]]
var outputs: [String : [String]]
let unusedOutputs: [String : [String]]
var attrs: [String : Attr] = [:]
var type: String
init(protoOpDesc: PaddleMobile_Framework_Proto_OpDesc) {
type = protoOpDesc.type
let creator = { (vars: [PaddleMobile_Framework_Proto_OpDesc.Var], canAdd: (String) -> Bool) -> [String : [String]] in
var map: [String : [String]] = [:]
for opDescVar in vars {
if (canAdd(opDescVar.parameter)) {
map[opDescVar.parameter] = opDescVar.arguments
}
}
return map
}
inputs = creator(protoOpDesc.inputs) {
opInfos[protoOpDesc.type]?.inputs.contains($0) ?? false
}
paraInputs = creator(protoOpDesc.inputs) {
!(opInfos[protoOpDesc.type]?.inputs.contains($0) ?? false)
}
outputs = creator(protoOpDesc.outputs) {
opInfos[protoOpDesc.type]?.outputs.contains($0) ?? false
}
unusedOutputs = creator(protoOpDesc.outputs) {
!(opInfos[protoOpDesc.type]?.outputs.contains($0) ?? false)
}
for attr in protoOpDesc.attrs {
if (attr.type != .block) {
attrs[attr.name] = attrWithProtoDesc(attrDesc: attr)
}
class OpDesc {
let inputs: [String : [String]]
var paraInputs: [String : [String]]
var outputs: [String : [String]]
let unusedOutputs: [String : [String]]
var attrs: [String : Attr] = [:]
var type: String
init(protoOpDesc: PaddleMobile_Framework_Proto_OpDesc) {
type = protoOpDesc.type
let creator = { (vars: [PaddleMobile_Framework_Proto_OpDesc.Var], canAdd: (String) -> Bool) -> [String : [String]] in
var map: [String : [String]] = [:]
for opDescVar in vars {
if (canAdd(opDescVar.parameter)) {
map[opDescVar.parameter] = opDescVar.arguments
}
}
return map
}
}
extension OpDesc: CustomStringConvertible, CustomDebugStringConvertible {
var description: String {
var str = ""
str += "op type: \(type): \n"
str += " op inputs: \n"
str += " \(inputs) \n"
str += " op para inputs: \n"
str += " \(paraInputs) \n"
str += " op para outputs: \n"
str += " \(outputs) \n"
str += " op attrs: \n"
str += " \(attrs) \n"
return str
inputs = creator(protoOpDesc.inputs) {
opInfos[protoOpDesc.type]?.inputs.contains($0) ?? false
}
var debugDescription: String {
return description
paraInputs = creator(protoOpDesc.inputs) {
!(opInfos[protoOpDesc.type]?.inputs.contains($0) ?? false)
}
outputs = creator(protoOpDesc.outputs) {
opInfos[protoOpDesc.type]?.outputs.contains($0) ?? false
}
unusedOutputs = creator(protoOpDesc.outputs) {
!(opInfos[protoOpDesc.type]?.outputs.contains($0) ?? false)
}
for attr in protoOpDesc.attrs {
if (attr.type != .block) {
attrs[attr.name] = attrWithProtoDesc(attrDesc: attr)
}
}
}
}
extension OpDesc: CustomStringConvertible, CustomDebugStringConvertible {
var description: String {
var str = ""
str += "op type: \(type): \n"
str += " op inputs: \n"
str += " \(inputs) \n"
str += " op para inputs: \n"
str += " \(paraInputs) \n"
str += " op para outputs: \n"
str += " \(outputs) \n"
str += " op attrs: \n"
str += " \(attrs) \n"
return str
}
var debugDescription: String {
return description
}
}
......@@ -14,7 +14,7 @@
import Foundation
public struct Program {
public class Program {
let paramPath: String
let programDesc: ProgramDesc
let scope: Scope
......@@ -23,4 +23,9 @@ public struct Program {
paramPath = inParamPath
scope = inScope
}
init(inProgramDesc: ProgramDesc, inScope: Scope) {
programDesc = inProgramDesc
scope = inScope
paramPath = ""
}
}
......@@ -14,7 +14,7 @@
import Foundation
public struct ProgramDesc {
public class ProgramDesc {
var blocks: [BlockDesc] = []
init(protoProgram: PaddleMobile_Framework_Proto_ProgramDesc) {
for block in protoProgram.blocks {
......
......@@ -15,204 +15,285 @@
import Foundation
precedencegroup ChainNode {
associativity: left
higherThan: MultiplicationPrecedence
associativity: left
higherThan: MultiplicationPrecedence
}
infix operator --> : ChainNode
class Node {
var inputs: [Node] = []
var outputs: [Node] = []
var type: String
var opDesc: OpDesc?
init(inOpDesc: OpDesc) {
type = inOpDesc.type
opDesc = inOpDesc
var inputs: [Node] = []
var outputs: [Node] = []
var type: String
var opDesc: OpDesc?
init(inOpDesc: OpDesc) {
type = inOpDesc.type
opDesc = inOpDesc
}
init(inType: String) {
type = inType
}
subscript(index: Int) -> [Node] {
var nodes: [Node] = []
getNodesWithLocation(index: index, nowIndex: 0, nodes: &nodes)
return nodes
}
func getNodesWithLocation(index: Int, nowIndex: Int, nodes: inout [Node]) {
if index == nowIndex {
nodes.append(self)
}
init(inType: String) {
type = inType
for output in outputs {
output.getNodesWithLocation(index: index, nowIndex: nowIndex + 1, nodes: &nodes)
}
}
static func -->(lNode: Node, rNode: Node) -> Node {
lNode.outputs.append(rNode)
rNode.inputs.append(lNode)
return rNode
}
func depth(begin: UInt = 1) -> UInt {
var beginMax: UInt = 1
for output in outputs {
let subDepth = output.depth(begin: begin + 1)
beginMax = max(begin, subDepth)
}
beginMax = max(begin, beginMax)
return beginMax
}
func to(depth: UInt) -> Node {
let beginNode = Node.init(inType: type)
beginNode.opDesc = opDesc
to(depth: depth - 1, withNode: beginNode)
return beginNode
}
func folderWith(fusion: Fusion.Type, removedNodes: inout [Node]) {
let fusionNode = fusion.fusionNode()
let change = fusion.change()
let inOutputs = outputs
outputs.removeAll()
opDesc?.outputs.removeAll()
for i in 0..<inOutputs.count {
inOutputs[i].folderWith(beginNode: self, matchNode: fusionNode.outputs[i], change: change, removedNodes: &removedNodes)
}
opDesc?.type = fusion.fusionType()
type = fusion.fusionType()
}
private func folderWith(beginNode: Node, matchNode: Node, change: [String : [(from: String, to: String)]], removedNodes: inout [Node]) {
guard let inOpdesc = opDesc else {
fatalError()
}
static func -->(lNode: Node, rNode: Node) -> Node {
lNode.outputs.append(rNode)
rNode.inputs.append(lNode)
return rNode
for attr in inOpdesc.attrs {
beginNode.opDesc?.attrs[attr.key] = attr.value
// print(beginNode.opDesc?.attrs)
}
func depth(begin: UInt = 1) -> UInt {
var beginMax: UInt = 1
for output in outputs {
let subDepth = output.depth(begin: begin + 1)
beginMax = max(begin, subDepth)
for paraInput in inOpdesc.paraInputs {
if let inChanges = change[type] {
for keyChange in inChanges {
if keyChange.from == paraInput.key {
beginNode.opDesc?.paraInputs[keyChange.to] = paraInput.value
} else {
beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value
}
}
beginMax = max(begin, beginMax)
return beginMax
} else {
beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value
}
}
func to(depth: UInt) -> Node {
let beginNode = Node.init(inType: type)
to(depth: depth - 1, withNode: beginNode)
return beginNode
if matchNode.outputs.count == 0 {
beginNode.outputs.append(contentsOf: outputs)
beginNode.opDesc?.outputs = inOpdesc.outputs
}
removedNodes.append(self)
func folderWith(fusion: Fusion.Type, removedNodes: inout [Node]) {
let fusionNode = fusion.fusionNode()
let change = fusion.change()
let inOutputs = outputs
outputs.removeAll()
opDesc?.outputs.removeAll()
for i in 0..<inOutputs.count {
inOutputs[i].folderWith(beginNode: self, matchNode: fusionNode.outputs[i], change: change, removedNodes: &removedNodes)
}
opDesc?.type = fusion.fusionType()
type = fusion.fusionType()
for i in 0..<matchNode.outputs.count {
outputs[i].folderWith(beginNode: beginNode, matchNode: matchNode.outputs[i], change: change, removedNodes: &removedNodes)
}
private func folderWith(beginNode: Node, matchNode: Node, change: [String : [(from: String, to: String)]], removedNodes: inout [Node]) {
guard let inOpdesc = opDesc else {
fatalError()
}
for attr in inOpdesc.attrs {
beginNode.opDesc?.attrs[attr.key] = attr.value
// print(beginNode.opDesc?.attrs)
}
for paraInput in inOpdesc.paraInputs {
if let inChanges = change[type] {
for keyChange in inChanges {
if keyChange.from == paraInput.key {
beginNode.opDesc?.paraInputs[keyChange.to] = paraInput.value
} else {
beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value
}
}
} else {
beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value
}
}
if matchNode.outputs.count == 0 {
beginNode.outputs.append(contentsOf: outputs)
beginNode.opDesc?.outputs = inOpdesc.outputs
}
removedNodes.append(self)
for i in 0..<matchNode.outputs.count {
outputs[i].folderWith(beginNode: beginNode, matchNode: matchNode.outputs[i], change: change, removedNodes: &removedNodes)
}
}
private func to(depth: UInt, withNode: Node) {
if depth < 1 {
return
}
private func to(depth: UInt, withNode: Node) {
if depth < 1 {
return
}
for output in outputs {
let node = Node.init(inType: output.type)
withNode.outputs.append(node)
output.to(depth: depth - 1, withNode: node)
}
for output in outputs {
let node = Node.init(inType: output.type)
node.opDesc = output.opDesc
withNode.outputs.append(node)
output.to(depth: depth - 1, withNode: node)
}
}
func relationship() -> [String : Node]{
var map: [String : Node] = [:]
relationship(map: &map)
return map
}
private func relationship(map: inout [String : Node]) {
guard let inOpDesc = opDesc else {
return
}
for output in inOpDesc.outputs {
for outputKey in output.value {
map[outputKey] = self
}
}
for output in outputs {
output.relationship(map: &map)
}
}
}
extension Node: Equatable {
static func == (lhs: Node, rhs: Node) -> Bool {
if lhs.outputs.count != rhs.outputs.count {
return false
}
if lhs.type != rhs.type {
return false
}
for i in 0..<lhs.outputs.count {
if lhs.outputs[i] != rhs.outputs[i] {
return false
}
}
return true
static func == (lhs: Node, rhs: Node) -> Bool {
if lhs.outputs.count != rhs.outputs.count {
return false
}
if lhs.type != rhs.type {
return false
}
for i in 0..<lhs.outputs.count {
if lhs.outputs[i] != rhs.outputs[i] {
return false
}
}
return true
}
}
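// Editor's illustrative sketch (not in the original sources): how the --> operator
// defined above chains Nodes into a fusion match pattern. The op type strings here
// ("conv2d", "elementwise_add", "relu") are hypothetical examples; fusionNode()
// implementations build their patterns the same way.
func exampleFusionPattern() -> Node {
  let convNode = Node(inType: "conv2d")
  let addNode = Node(inType: "elementwise_add")
  let reluNode = Node(inType: "relu")
  // --> links outputs/inputs and returns the right-hand node, so chains read left to right.
  _ = convNode --> addNode --> reluNode
  // After chaining: convNode.outputs == [addNode], addNode.outputs == [reluNode], convNode.depth() == 3
  return convNode
}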
class ProgramOptimize<P: PrecisionType> {
let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp<P>.self, ConvAddOp<P>.self]
func optimize(originProgramDesc: ProgramDesc) -> ProgramDesc {
guard originProgramDesc.blocks.count == 1 else {
fatalError(" not support yet")
// register fusion
let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp<P>.self,
// ConvAddAddPreluOp<P>.self,
ConvAddPreluOp<P>.self,
ConvAddOp<P>.self,
ConvBNReluOp<P>.self,
DwConvBNReluOp<P>.self,
ElementwiseAddPreluOp<P>.self
]
func optimize(originProgramDesc: ProgramDesc) -> ProgramDesc {
guard originProgramDesc.blocks.count == 1 else {
fatalError(" not support yet")
}
var mapForNodeChain: [String : Node] = [:]
var nodes: [Node] = []
var typeMapNodes: [String : [(node: Node, output: [String : Node])]] = [:]
let block = originProgramDesc.blocks[0]
for opDesc in block.ops {
guard let opInputKeys = opInfos[opDesc.type]?.inputs, let outputKeys = opInfos[opDesc.type]?.outputs else {
fatalError()
}
let node = Node.init(inOpDesc: opDesc)
for inputKey in opInputKeys {
if let inputs = opDesc.inputs[inputKey] {
for input in inputs {
if let inputNode = mapForNodeChain[input] {
_ = inputNode --> node
}
}
}
var mapForNodeChain: [String : Node] = [:]
var nodes: [Node] = []
var typeMapNodes: [String : [Node]] = [:]
let block = originProgramDesc.blocks[0]
for opDesc in block.ops {
guard let opInputKeys = opInfos[opDesc.type]?.inputs, let outputKeys = opInfos[opDesc.type]?.outputs else {
fatalError()
}
let node = Node.init(inOpDesc: opDesc)
for inputKey in opInputKeys {
if let inputs = opDesc.inputs[inputKey] {
for input in inputs {
if let inputNode = mapForNodeChain[input] {
_ = inputNode --> node
}
}
}
for outputKey in outputKeys {
if let outputs = opDesc.outputs[outputKey] {
for output in outputs {
mapForNodeChain[output] = node
}
}
}
nodes.append(node)
if var inNodes = typeMapNodes[opDesc.type] {
inNodes.append((node, mapForNodeChain))
typeMapNodes[opDesc.type] = inNodes
} else {
typeMapNodes[opDesc.type] = [(node, mapForNodeChain)]
}
}
for fusion in fusionOps {
let fusionNode = fusion.fusionNode()
let depth = fusionNode.depth()
if let toMatchNodes = typeMapNodes[fusionNode.type] {
for node in toMatchNodes {
let toNode = node.node.to(depth: depth)
if toNode == fusionNode { // match
var canFolder = true
let relationshipMap = toNode.relationship()
for toCheck in fusion.needCheck() {
// let nodes = toCheck
let checkNodes = toNode[toCheck.0]
for checkNode in checkNodes {
let inputToChecks = checkNode.opDesc?.inputs[toCheck.1] ?? []
for inputToCheck in inputToChecks {
if node.output[inputToCheck] == nil {
if relationshipMap[inputToCheck] == nil {
canFolder = false
}
}
}
for outputKey in outputKeys {
if let outputs = opDesc.outputs[outputKey] {
for output in outputs {
mapForNodeChain[output] = node
}
let paramInputToChecks = checkNode.opDesc?.paraInputs[toCheck.1] ?? []
for paramInputToCheck in paramInputToChecks {
if node.output[paramInputToCheck] == nil {
if relationshipMap[paramInputToCheck] == nil {
canFolder = false
}
}
}
nodes.append(node)
if var inNodes = typeMapNodes[opDesc.type] {
inNodes.append(node)
typeMapNodes[opDesc.type] = inNodes
} else {
typeMapNodes[opDesc.type] = [node]
}
}
}
for fusion in fusionOps {
let fusionNode = fusion.fusionNode()
let depth = fusionNode.depth()
if let toMatchNodes = typeMapNodes[fusionNode.type] {
for node in toMatchNodes {
let toNode = node.to(depth: depth)
if toNode == fusionNode { // match
var removeNodes: [Node] = []
node.folderWith(fusion: fusion, removedNodes: &removeNodes)
for removeNode in removeNodes {
nodes.remove(element: removeNode)
}
}
}
}
if !canFolder {
continue
}
var ops: [OpDesc] = []
for node in nodes {
ops.append(node.opDesc!)
var removeNodes: [Node] = []
node.node.folderWith(fusion: fusion, removedNodes: &removeNodes)
for removeNode in removeNodes {
nodes.remove(element: removeNode)
}
}
}
var newProgramDesc = ProgramDesc.init()
let newBlock = BlockDesc.init(inVars: block.vars, inOps: ops)
newProgramDesc.blocks.append(newBlock)
return newProgramDesc
}
}
var ops: [OpDesc] = []
for node in nodes {
ops.append(node.opDesc!)
}
var newProgramDesc = ProgramDesc.init()
let newBlock = BlockDesc.init(inVars: block.vars, inOps: ops)
newProgramDesc.blocks.append(newBlock)
return newProgramDesc
}
}
......@@ -14,18 +14,18 @@
import Foundation
struct TensorDesc {
class TensorDesc {
let dims: [Int]
let dataType: VarTypeType
let dataLayout: DataLayout = .NCHW
let dataLayout: DataLayout = DataLayout.NCHW()
var NCHWDim: [Int] {
get {
if dims.count != 4 {
return dims
}
if dataLayout == .NCHW {
if dataLayout == DataLayout.NCHW() {
return dims
} else if dataLayout == .NHWC{
} else if dataLayout == DataLayout.NHWC() {
var resultDims = dims
resultDims.swapAt(1, 3)
return resultDims
......@@ -40,9 +40,9 @@ struct TensorDesc {
if dims.count != 4 {
return dims
}
if dataLayout == .NHWC {
if dataLayout == DataLayout.NHWC() {
return dims
} else if dataLayout == .NCHW{
} else if dataLayout == DataLayout.NCHW() {
var resultDims = dims
resultDims.swapAt(1, 3)
return resultDims
......@@ -53,7 +53,7 @@ struct TensorDesc {
}
init(protoTensorDesc: PaddleMobile_Framework_Proto_VarType.TensorDesc) {
dims = protoTensorDesc.dims.map{ Int($0) > 0 ? Int($0) : 1 }
dims = protoTensorDesc.dims.map{ Int($0) > 0 ? Int($0) : abs(Int($0)) }
dataType = VarTypeType.init(rawValue: protoTensorDesc.dataType.rawValue) ?? .ErrorType
}
......
......@@ -56,7 +56,7 @@ enum VarTypeType: Int {
}
}
struct VarDesc {
class VarDesc {
let name: String
let persistable: Bool
let type: VarTypeType
......
......@@ -31,15 +31,14 @@ public struct Dim {
return dims.reduce(1) { $0 * $1 }
}
static func ==(left: Dim, right: Dim) -> Bool {
public static func ==(left: Dim, right: Dim) -> Bool {
return left.dims == right.dims;
}
subscript(index: Int) -> Int {
public subscript(index: Int) -> Int {
return dims[index];
}
private(set) var dims: [Int]
private init(){
fatalError()
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
let testTo = 81
var isTest = false
let computePrecision: ComputePrecision = .Float16
public class GPUResultHolder {
public let dim: [Int]
public let capacity: Int
public var resultPointer: UnsafeMutablePointer<Float32>?
public var intermediateResults: [String : [Variant]]?
public let elapsedTime: Double
public init(inDim: [Int], inPointer: UnsafeMutablePointer<Float32>?, inCapacity: Int, inElapsedTime: Double, inIntermediateResults: [String : [Variant]]? = nil) {
dim = inDim
capacity = inCapacity
if let inInPointer = inPointer {
resultPointer = UnsafeMutablePointer<Float32>.allocate(capacity: inCapacity)
resultPointer?.initialize(from: inInPointer, count: inCapacity)
}
elapsedTime = inElapsedTime
intermediateResults = inIntermediateResults
}
}
extension GPUResultHolder: CustomDebugStringConvertible, CustomStringConvertible {
public var debugDescription: String {
// var str = ""
// str += "Dim: \(dim) \n value:[ "
// if resultArr.count < 20 {
// for d in resultArr {
// str += " \(d) "
// }
// } else {
// for d in stride(from: 0, to: resultArr.count, by: resultArr.count/20) {
// str += " \(resultArr[d]) "
// }
// }
// str += " ]"
// return str
fatalError()
}
public var description: String {
return debugDescription
}
}
public class Executor<P: PrecisionType> {
var ops: [Runable & InferShaperable] = []
let program: Program
let device: MTLDevice
let inflightSemaphore: DispatchSemaphore
let queue: MTLCommandQueue
public init(inDevice:MTLDevice, inQueue: MTLCommandQueue, inProgram: Program) throws {
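// Allow at most three predictions to be in flight at once: predict() waits on this
// semaphore before encoding, and the command buffer's completion handler signals it.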
self.inflightSemaphore = DispatchSemaphore(value: 3)
program = inProgram
device = inDevice
queue = inQueue
// print("before for ")
//print(program.scope.vars["fea_pyramid1_mbox_conf_flat.Flatten.output.1.tmp_0"])
for block in inProgram.programDesc.blocks {
//block.ops.count
for i in 0..<block.ops.count {
let opDesc = block.ops[i]
do {
// print("in for i \(i): ")
// print(program.scope.vars["fea_pyramid1_mbox_conf_flat.Flatten.output.1.tmp_0"])
//
// if i == 56 {
// print(program.scope.vars["fea_pyramid1_mbox_conf_flat.Flatten.output.1.tmp_0"])
//
// }
let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: opDesc, scope: inProgram.scope)
ops.append(op)
} catch let error {
throw error
}
}
}
}
public func predict(input: MTLTexture, dim: [Int], completionHandle: @escaping (GPUResultHolder) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws {
guard let buffer = queue.makeCommandBuffer() else {
throw PaddleMobileError.predictError(message: "CommandBuffer is nil")
}
inflightSemaphore.wait()
let resInput: MTLTexture
if let inPre = preProcessKernle {
do {
try inPre.compute(inputTexuture: input, commandBuffer: buffer)
resInput = inPre.outputTexture
} catch let error {
throw error
}
} else {
resInput = input
}
let beforeDate = Date.init()
let inputTexture = InputTexture.init(inMTLTexture: resInput, inExpectDim: Dim.init(inDim: dim))
program.scope.setInput(input: inputTexture)
//(ops.count - except)
for i in 0..<(ops.count - except) {
let op = ops[i]
do {
try op.run(device: device, buffer: buffer)
} catch let error {
throw error
}
}
var outputTextures: [String : [Variant]]?
if except > 0 {
ops[ops.count - except].computeMiddleResult(device: device, buffer: buffer)
outputTextures = ops[ops.count - except].inputVariant()
}
buffer.addCompletedHandler { [weak self] (commandbuffer) in
// let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
// print(inputArr.strideArray())
//
//// print(dim)
// writeToLibrary(fileName: "test_image_ssd_ar", array: inputArr)
// print(" write done ")
// print("write to library done")
// return
// print(inputArr)
//
// let stridableInput: [(index: Int, value: Float)] = input.stridableFloatArray()
// print(stridableInput)
//
// let _: Flo? = input.logDesc(header: "input: ", stridable: true)
// for i in 0..<self!.ops.count {
// let op = self!.ops[i]
// print(" 第 \(i) 个 op: ")
// op.delogOutput()
// }
// return;
// self!.ops[testTo - 2].delogOutput()
// self!.ops[testTo - 1].delogOutput()
// self!.ops[5].delogOutput()
// return
guard let SSelf = self else {
// return
fatalError()
}
let afterDate = Date.init()
var resultHolder: GPUResultHolder
if except > 0 {
resultHolder = GPUResultHolder.init(inDim: [], inPointer: nil, inCapacity: 0, inElapsedTime: afterDate.timeIntervalSince(beforeDate), inIntermediateResults: outputTextures)
} else {
let outputVar: Variant = SSelf.program.scope.output()!
let output: FetchHolder = outputVar as! FetchHolder
// let beforeToTensorDate = Date.init()
resultHolder = GPUResultHolder.init(inDim: output.dim, inPointer: output.result, inCapacity: output.capacity, inElapsedTime: afterDate.timeIntervalSince(beforeDate))
// let timeToTensor = Date.init().timeIntervalSince(beforeToTensorDate)
// print(timeToTensor)
}
completionHandle(resultHolder)
SSelf.inflightSemaphore.signal()
}
buffer.commit()
}
public func clear() {
program.scope.clear()
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Foundation
import SwiftProtobuf
public class Loader<P: PrecisionType> {
class ParaLoader {
let file: UnsafeMutablePointer<FILE>
let fileSize: Int
var nowIndex: Int
init(paramPath: String) throws {
guard let tmpFile = fopen(paramPath, "rb") else {
throw PaddleMobileError.loaderError(message: "open param file error" + paramPath)
}
file = tmpFile
fseek(file, 0, SEEK_END)
fileSize = ftell(file)
guard fileSize > 0 else {
throw PaddleMobileError.loaderError(message: "param file size is too small")
}
rewind(file)
nowIndex = 0
}
func read(tensor: Tensor<P>) throws {
guard nowIndex <= fileSize else {
throw PaddleMobileError.loaderError(message: "out of the file range")
}
func pointerReader<T>(type: T.Type) -> T {
let ptr = UnsafeMutablePointer<T>.allocate(capacity: MemoryLayout<T>.size)
fread(ptr, 1, MemoryLayout<T>.size, file)
nowIndex += MemoryLayout<T>.size
let pointee = ptr.pointee
ptr.deinitialize(count: MemoryLayout<UInt32>.size)
ptr.deallocate()
return pointee
}
let _ = pointerReader(type: UInt32.self)
let lodLevel = pointerReader(type: UInt64.self)
for _ in 0..<lodLevel {
let size = pointerReader(type: UInt64.self)
for _ in 0..<Int(size/UInt64(MemoryLayout<size_t>.size)){
_ = pointerReader(type: size_t.self)
}
}
let _ = pointerReader(type: UInt32.self)
let tensorDescSize = pointerReader(type: Int32.self)
fseek(file, Int(tensorDescSize), SEEK_CUR)
nowIndex += Int(tensorDescSize)
/*
The precision is not inferred from the Data Type here; it is fixed by the external generic parameter instead
*/
// The model currently supplies Float data; this should really be determined by the model itself
// let tmpCapacity = MemoryLayout<Float>.size * tensor.numel()
// let tmpPointer = UnsafeMutablePointer<Float>.allocate(capacity: tmpCapacity);
let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file)
guard bytesRead == tensor.data.size else {
throw PaddleMobileError.loaderError(message: "param read size error")
}
// TODO: use script to convert
// let bytesRead = fread(tmpPointer, 1, tmpCapacity, file)
// for i in 0..<tensor.numel() {
// tensor.data[i] = P.init(inFloat: tmpPointer[i])
// }
// tmpPointer.deinitialize(count: tmpCapacity)
// tmpPointer.deallocate()
nowIndex += bytesRead
}
deinit {
fclose(file)
}
}
class ParaLoaderWithPointer {
var paramPointer: UnsafeMutableRawPointer
let paramSize: Int
var nowIndex: Int
init(pPointer: UnsafeMutableRawPointer,pSize:Int) throws {
paramPointer = UnsafeMutableRawPointer.init(pPointer)
paramSize = pSize
nowIndex = 0
}
func read(tensor: Tensor<P>) throws {
guard nowIndex <= paramSize else {
throw PaddleMobileError.loaderError(message: "out of the file range")
}
var readerIndex: Int = 0
func pointerReader<T>(type: T.Type) -> T {
let ptr = UnsafeMutablePointer<T>.allocate(capacity: MemoryLayout<T>.size)
memcpy(ptr, paramPointer.advanced(by: Int(readerIndex)), MemoryLayout<T>.size)
nowIndex += MemoryLayout<T>.size
readerIndex += MemoryLayout<T>.size
let pointee = ptr.pointee
ptr.deinitialize(count: MemoryLayout<UInt32>.size)
ptr.deallocate()
return pointee
}
let _ = pointerReader(type: UInt32.self)
let lodLevel = pointerReader(type: UInt64.self)
for _ in 0..<lodLevel {
let size = pointerReader(type: UInt64.self)
for _ in 0..<Int(size/UInt64(MemoryLayout<size_t>.size)){
_ = pointerReader(type: size_t.self)
}
}
let _ = pointerReader(type: UInt32.self)
let tensorDescSize = pointerReader(type: Int32.self)
paramPointer = paramPointer.advanced(by: Int(readerIndex))
paramPointer = paramPointer.advanced(by: Int(tensorDescSize))
nowIndex += Int(tensorDescSize)
let _ = memcpy(tensor.data.pointer, paramPointer, tensor.data.size)
paramPointer = paramPointer.advanced(by: Int(tensor.data.size))
nowIndex += tensor.data.size
}
deinit {
}
}
public init(){}
func loadModelandParam(_ device:MTLDevice,_ modelData:Data, _ paraLoaderPointer:ParaLoaderWithPointer?, _ paraLoader:ParaLoader?) throws -> Program {
do {
let protoProgram = try PaddleMobile_Framework_Proto_ProgramDesc.init(
serializedData: modelData)
let originProgramDesc = ProgramDesc.init(protoProgram: protoProgram)
let programDesc = ProgramOptimize<P>.init().optimize(originProgramDesc: originProgramDesc)
print(programDesc)
guard programDesc.blocks.count > 0 else {
throw PaddleMobileError.loaderError(message: "count of blocks must greater than 0")
}
// to get feed key and fetch key
let block = programDesc.blocks[0]
guard let firstOp = block.ops.first, let lastOp = block.ops.last else {
throw PaddleMobileError.loaderError(message: "at least two operator")
}
guard firstOp.type == gFeedType, lastOp.type == gFetchType else {
throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch")
}
guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else {
throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found")
}
guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else {
throw PaddleMobileError.loaderError(message: "feed key or fetch key not found")
}
let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey)
// to load memory
for block in programDesc.blocks {
for varDesc in block.vars {
if (varDesc.type == .LodTensor) {
guard let tensorDesc = varDesc.tensorDesc else {
throw PaddleMobileError.loaderError(message: "get tensor desc failed")
}
if (varDesc.persistable
&& varDesc.type != .FeedMiniBatch
&& varDesc.type != .FetchList) {
let dimArr = tensorDesc.dims
guard dimArr.count > 0 else {
throw PaddleMobileError.loaderError(message: "tensor desc dim size error")
}
let dim = Dim.init(inDim: dimArr)
let tensor = Tensor<P>.init(inDim: dim, inLayout: tensorDesc.dataLayout)
do {
if paraLoaderPointer != nil {
try paraLoaderPointer!.read(tensor: tensor)
}
if paraLoader != nil {
try paraLoader!.read(tensor: tensor)
}
} catch let error {
throw error
}
// tensor.convert(to: DataLayout.NHWC())
// tensor.initBuffer(device: device)
scope[varDesc.name] = tensor
} else {
let dim = Dim.init(inDim: tensorDesc.dims)
scope[varDesc.name] = Texture<P>.init(device: device, inDim: dim)
}
} else {
if varDesc.name == fetchKey {
// scope[varDesc.name] = ResultHolder.init(inDim: [], inResult: [], inCapacity: <#Int#>, inElapsedTime: 0.0)
} else if varDesc.name == feedKey {
}
}
}
}
let program = Program.init(inProgramDesc: programDesc, inScope: scope)
return program
} catch _ {
throw PaddleMobileError.loaderError(message: "protobuf decoder error")
}
}
public func load(device:MTLDevice, paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) throws -> Program {
let modelData = Data.init(bytes:modePointer, count:modelSize)
guard let paraLoader = try? ParaLoaderWithPointer.init(pPointer: paramPointer,pSize: paramSize) else {
throw PaddleMobileError.loaderError(message: "load para error")
}
do {
let program = try loadModelandParam(device,modelData,paraLoader,nil)
return program
} catch let error {
throw error
}
}
public func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program{
guard let modelData = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else {
throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !")
}
guard let paraLoader = try? ParaLoader.init(paramPath: paraPath) else {
throw PaddleMobileError.loaderError(message: "load para error")
}
do {
let program = try loadModelandParam(device,modelData,nil,paraLoader)
return program
} catch let error {
throw error
}
}
}
......@@ -12,251 +12,308 @@
See the License for the specific language governing permissions and
limitations under the License. */
import Accelerate
import Foundation
protocol Tensorial: CustomStringConvertible, CustomDebugStringConvertible{
var dim: Dim { get set }
func numel() -> Int
var layout: DataLayout { get }
var dim: Dim { get set }
func numel() -> Int
var layout: DataLayout { get }
}
extension Tensorial {
func numel() -> Int {
return dim.numel()
}
func numel() -> Int {
return dim.numel()
}
}
public enum ComputePrecision {
case Float32, Float16
}
class Tensor<P: PrecisionType>: Tensorial {
enum BufferPrecision {
case Float32, Float16
var data: Data
var dim: Dim
var buffer: MTLBuffer!
private(set) var layout: DataLayout
class Data {
init(inSize: Int, inPointer: UnsafeMutablePointer<P>) {
size = inSize
pointer = inPointer
}
let size: Int
var pointer: UnsafeMutablePointer<P>
subscript(index: Int) -> P{
get {
return pointer[index]
}
set {
pointer[index] = newValue
}
}
func release() {
pointer.deinitialize(count: size)
pointer.deallocate()
}
deinit {
// release()
}
}
required init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) {
dim = inDim
let size = inDim.numel() * MemoryLayout<P>.size
let pointer = UnsafeMutablePointer<P>.allocate(capacity: size)
data = Data.init(inSize: size, inPointer: pointer)
layout = inLayout
}
func convert(to: DataLayout) {
guard to != layout else {
return
}
var data: Data
var dim: Dim
var buffer: MTLBuffer!
private(set) var layout: DataLayout
guard dim.cout() == 4 else {
return
}
class Data {
init(inSize: Int, inPointer: UnsafeMutablePointer<P>) {
size = inSize
pointer = inPointer
}
let size: Int
var pointer: UnsafeMutablePointer<P>
subscript(index: Int) -> P{
get {
return pointer[index]
}
set {
pointer[index] = newValue
}
}
func release() {
pointer.deinitialize(count: size)
pointer.deallocate()
}
deinit {
// release()
}
guard layout == DataLayout.NCHW() && to == DataLayout.NHWC() else {
// other not support
return
}
required init(inDim: Dim, inLayout: DataLayout = .NCHW) {
dim = inDim
let size = inDim.numel() * MemoryLayout<P>.size
let pointer = UnsafeMutablePointer<P>.allocate(capacity: size)
data = Data.init(inSize: size, inPointer: pointer)
layout = inLayout
let newPointer = UnsafeMutablePointer<P>.allocate(capacity: data.size)
if layout == DataLayout.NCHW() {
NCHW2NHWC(newPtr: newPointer)
}
func convert(to: DataLayout) {
guard to != layout else {
return
}
guard dim.cout() == 4 else {
return
}
guard layout == .NCHW && to == .NHWC else {
// other not support
return
}
let newPointer = UnsafeMutablePointer<P>.allocate(capacity: data.size)
if layout == .NCHW {
NCHW2NHWC(newPtr: newPointer)
}
data.release()
data.pointer = newPointer
layout = to
data.release()
data.pointer = newPointer
layout = to
}
func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16, convertToNHWC: Bool = true, withTranspose: Bool = false) {
if convertToNHWC {
// print(layout)
convert(to: DataLayout.NHWC())
}
func float32ToFloat16(input: UnsafeMutablePointer<Float32>, output: UnsafeMutableRawPointer, count: Int) {
var float32Buffer = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4)
var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2)
guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else {
fatalError(" float 32 to float 16 error ! ")
if withTranspose {
let transposePointer = UnsafeMutablePointer<P>.allocate(capacity: numel())
let n = dim[0]
let hwc = numel()/n
for j in 0..<hwc {
for i in 0..<n {
//data[i * hwc + j]
transposePointer[j * n + i] = data[i * hwc + j]
}
}
dim.swapeDimAt(index1: 0, index2: 3)
data.release()
data.pointer = transposePointer
}
func initBuffer(device: MTLDevice, precision: BufferPrecision = .Float32) {
guard let floatPointer = data.pointer as? UnsafeMutablePointer<Float32> else {
fatalError(" not support yet ")
}
let precisionSize: Int
switch precision {
case .Float32:
precisionSize = 4
case .Float16:
precisionSize = 2
}
if dim.cout() == 4 {
if layout == .NHWC {
let C = dim[3]
let cSlices = (C + 3) / 4
let paddedC = cSlices * 4
let count = paddedC * dim[0] * dim[1] * dim[2]
if C == paddedC {
buffer = device.makeBuffer(length: count * precisionSize)
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
}
} else if C == 1 {
buffer = device.makeBuffer(length: numel() * precisionSize)
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
}
} else {
buffer = device.makeBuffer(length: count * precisionSize)
let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
var tmpPointer = floatPointer
var dstPtr = convertedPointer
for _ in 0..<dim[0] * dim[1] * dim[2] {
for j in 0..<paddedC {
if j < C {
dstPtr[j] = tmpPointer[j]
}
}
tmpPointer += C
dstPtr += paddedC
}
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
}
convertedPointer.deinitialize(count: count)
convertedPointer.deallocate()
}
}
} else if dim.cout() == 1 {
buffer = device.makeBuffer(length: numel() * precisionSize)
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
}
} else {
fatalError(" not support !")
}
//TODO: release
data.release()
guard let floatPointer = data.pointer as? UnsafeMutablePointer<Float32> else {
fatalError(" not support yet ")
}
var width: Int {
get {
if dim.cout() == 4 {
return dim[1]
} else {
fatalError()
}
}
let precisionSize: Int
switch precision {
case .Float32:
precisionSize = 4
case .Float16:
precisionSize = 2
}
var height: Int {
get {
if dim.cout() == 4 {
return dim[2]
} else {
fatalError()
if dim.cout() == 4 {
if layout == DataLayout.NHWC() {
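// Pad the channel count up to a multiple of 4 so the data can be packed into RGBA texel slices.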
let C = dim[3]
let cSlices = (C + 3) / 4
let paddedC = cSlices * 4
let count = paddedC * dim[0] * dim[1] * dim[2]
if C == paddedC {
buffer = device.makeBuffer(length: count * precisionSize)
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
}
} else if C == 1 {
buffer = device.makeBuffer(length: numel() * precisionSize)
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
}
} else {
buffer = device.makeBuffer(length: count * precisionSize)
let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
var tmpPointer = floatPointer
var dstPtr = convertedPointer
for _ in 0..<dim[0] * dim[1] * dim[2] {
for j in 0..<paddedC {
if j < C {
dstPtr[j] = tmpPointer[j]
} else {
dstPtr[j] = 0
}
}
tmpPointer += C
dstPtr += paddedC
}
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
}
convertedPointer.deinitialize(count: count)
convertedPointer.deallocate()
}
}
var channel: Int {
get {
if dim.cout() == 4 {
return dim[3]
} else {
fatalError()
} else {
let C = dim[3]
let cSlices = (C + 3) / 4
let paddedC = cSlices * 4
let count = paddedC * dim[0] * dim[1] * dim[2]
if C == paddedC {
buffer = device.makeBuffer(length: count * precisionSize)
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
}
} else if C == 1 {
fatalError(" not support ")
} else {
buffer = device.makeBuffer(length: count * precisionSize)
let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
var tmpPointer = floatPointer
var dstPtr = convertedPointer
for _ in 0..<dim[0] * dim[1] * dim[2] {
for j in 0..<paddedC {
if j < C {
dstPtr[j] = tmpPointer[j]
} else {
dstPtr[j] = 0
}
}
tmpPointer += C
dstPtr += paddedC
}
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
}
convertedPointer.deinitialize(count: count)
convertedPointer.deallocate()
}
}
} else if dim.cout() == 1 {
let num = ((numel() + 3) / 4) * 4
buffer = device.makeBuffer(length: num * precisionSize)
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: data.pointer, byteCount: num * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: floatPointer, output: buffer.contents(), count: num)
}
} else {
fatalError(" not support !")
}
//TODO: release
data.release()
}
var width: Int {
get {
if dim.cout() == 4 {
return dim[1]
} else {
fatalError()
}
}
}
var height: Int {
get {
if dim.cout() == 4 {
return dim[2]
} else {
fatalError()
}
}
}
var channel: Int {
get {
if dim.cout() == 4 {
return dim[3]
} else {
fatalError()
}
}
}
func NCHW2NHWC(newPtr: UnsafeMutablePointer<P>) {
let N = dim[0]
let C = dim[1]
let H = dim[2]
let W = dim[3]
let HXW = H * W
let CXHXW = C * H * W
func NCHW2NHWC(newPtr: UnsafeMutablePointer<P>) {
let N = dim[0]
let C = dim[1]
let H = dim[2]
let W = dim[3]
let HXW = H * W
let CXHXW = C * H * W
var index: Int = 0
for n in 0..<N {
for h in 0..<H{
for w in 0..<W{
for c in 0..<C{
newPtr[index] = data.pointer[n * CXHXW + c * HXW + h * W + w]
index += 1
}
}
}
var index: Int = 0
for n in 0..<N {
for h in 0..<H{
for w in 0..<W{
for c in 0..<C{
newPtr[index] = data.pointer[n * CXHXW + c * HXW + h * W + w]
index += 1
}
}
dim.swapeDimAt(index1: 1, index2: 3)
}
}
dim.swapeDimAt(index1: 1, index2: 3)
}
}
extension Tensor {
var debugDescription: String {
var str = "dim: \(dim) \n"
str += "MTLBuffer: \(self.buffer) \n"
for i in 0..<buffer.length/MemoryLayout<P>.size {
str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])"
}
return str
var debugDescription: String {
var str = "dim: \(dim) \n"
str += "MTLBuffer: \(self.buffer) \n"
for i in 0..<buffer.length/MemoryLayout<P>.size {
str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])"
}
func logDataPointer(header: String = "") {
print(header)
var str = ""
str += "data size: \(data.size) \n"
str += "dim: \(dim) \n"
for i in 0..<numel() {
str += " \(data.pointer[i])"
}
print(str)
return str
}
func logDataPointer(header: String = "") {
print(header)
var str = ""
str += "data size: \(data.size) \n"
str += "dim: \(dim) \n"
for i in 0..<numel() {
str += " \(data.pointer[i])"
}
var description: String {
return debugDescription
}
print(str)
}
var description: String {
return debugDescription
}
}
......@@ -16,127 +16,163 @@ import Metal
import Foundation
class InputTexture {
let mtlTexture: MTLTexture
let expectDim: Dim
init(inMTLTexture: MTLTexture, inExpectDim: Dim) {
mtlTexture = inMTLTexture
expectDim = inExpectDim
}
let mtlTexture: MTLTexture
let expectDim: Dim
init(inMTLTexture: MTLTexture, inExpectDim: Dim) {
mtlTexture = inMTLTexture
expectDim = inExpectDim
}
}
extension InputTexture {
var description: String {
get{
return mtlTexture.description
}
var description: String {
get{
return mtlTexture.description
}
var debugDescription: String {
get {
return mtlTexture.debugDescription ?? " MetalTexture "
}
}
var debugDescription: String {
get {
return mtlTexture.debugDescription ?? " MetalTexture "
}
}
}
public class Texture<P: PrecisionType>: Tensorial {
var dim: Dim
let textureDesc: MTLTextureDescriptor
var metalTexture: MTLTexture
init(device: MTLDevice, inDim: Dim, inLayout: DataLayout = .NHWC) {
dim = inDim
layout = inLayout
let tmpTextureDes = MTLTextureDescriptor.init()
if inDim.cout() == 1 {
tmpTextureDes.width = inDim[0]
tmpTextureDes.textureType = .type1D
} else if inDim.cout() == 4 {
tmpTextureDes.height = inDim[1]
tmpTextureDes.width = inDim[2]
// print("n : \(inDim[0])")
// print(inDim[3] * inDim[0])
tmpTextureDes.depth = 1
tmpTextureDes.arrayLength = (inDim[3] * inDim[0] + 3)/4
tmpTextureDes.textureType = .type2DArray
} else if inDim.cout() == 2 {
tmpTextureDes.height = 1
tmpTextureDes.width = 1
tmpTextureDes.depth = 1
tmpTextureDes.arrayLength = (inDim[0] * inDim[1] + 3)/4
tmpTextureDes.textureType = .type2DArray
} else {
fatalError(" not suuprt ")
}
if MemoryLayout<P>.size == 1 {
tmpTextureDes.pixelFormat = .rgba8Unorm
} else if MemoryLayout<P>.size == 2 {
tmpTextureDes.pixelFormat = .rgba16Float
} else if MemoryLayout<P>.size == 4 {
// tmpTextureDes.pixelFormat = .r32Float
tmpTextureDes.pixelFormat = .rgba32Float
}
// tmpTextureDes.pixelFormat = .rgba16Float
/*
When a 4-D tensor is stored in a texture, the transpose must be taken into account.
If the transposed dims are [a, b, c, d], the corresponding texture_2darray is
.width = c
.height = b
.len = (a * d + 3) / 4
Tensors with fewer than 4 dims must use transpose [0, 1, 2, 3], i.e. no transpose is applied.
// TODO: rules for extending transpose to lower-dimensional tensors...
// [a, b] -> [1, 1, a, b] transpose must be [0, 1, x, x]
// [a] -> [1, 1, 1, a] transpose must be [0, 1, 2, 3]
// [a, b, c] -> [1, a, b, c] transpose must be [0, x, x, x]
A 3-D tensor [a, b, c] corresponds to a texture_2darray with
.width = c
.height = b
.len = (a + 3) / 4
A 2-D tensor [a, b] corresponds to a texture_2darray with
.width = (b + 3) / 4
.height = a
.len = 1
A 1-D tensor [a] corresponds to a texture_2darray with
.width = (a + 3) / 4
.height = 1
.len = 1
*/
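// Worked example (added for illustration): a tensor with padToFourDim [1, 3, 224, 224]
// and transpose [0, 2, 3, 1] has transposed dims [a, b, c, d] = [1, 224, 224, 3],
// so the texture_2darray gets .width = 224, .height = 224, .len = (1 * 3 + 3) / 4 = 1.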
tmpTextureDes.usage = [.shaderRead, .shaderWrite]
tmpTextureDes.storageMode = .shared
textureDesc = tmpTextureDes
metalTexture = device.makeTexture(descriptor: tmpTextureDes) ?! " texture nil "
public class Texture<P: PrecisionType>: Tensorial {
var dim: Dim
public var tensorDim: Dim
public var padToFourDim: Dim
private var textureDesc: MTLTextureDescriptor!
public var metalTexture: MTLTexture!
var transpose: [Int] = [0, 1, 2, 3]
func toTensor() -> [Float32] {
guard padToFourDim.cout() == 4 else {
fatalError("- not support -")
}
return metalTexture.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
}
func realNHWC() -> [Float32] {
guard padToFourDim.cout() == 4 else {
fatalError(" - not support - ")
}
return metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
}
  func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) {
    transpose = inTranspose
    for i in 0..<(4 - tensorDim.cout()) {
      if i != inTranspose[i] {
        fatalError()
      }
    }

    let newDim = transpose.map { padToFourDim[$0] }
    let newLayout = transpose.map { layout.layoutWithDim[$0] }

    layout = DataLayout.init(newLayout)
    dim = Dim.init(inDim: newDim)

    let tmpTextureDes = MTLTextureDescriptor.init()
    tmpTextureDes.textureType = .type2DArray
    tmpTextureDes.depth = 1

    switch tensorDim.cout() {
    case 4:
      tmpTextureDes.width = newDim[2]
      tmpTextureDes.height = newDim[1]
      tmpTextureDes.arrayLength = ((newDim[0]) * (newDim[3]) + 3) / 4
    case 3:
      tmpTextureDes.width = newDim[3]
      tmpTextureDes.height = newDim[2]
      tmpTextureDes.arrayLength = (newDim[1] + 3) / 4
    case 2, 1:
      tmpTextureDes.width = (newDim[3] + 3) / 4
      tmpTextureDes.height = newDim[2]
      tmpTextureDes.arrayLength = 1
    default:
      fatalError("unreachable")
    }

    if computePrecision == .Float16 {
      tmpTextureDes.pixelFormat = .rgba16Float
    } else if computePrecision == .Float32 {
      tmpTextureDes.pixelFormat = .rgba32Float
    }

    tmpTextureDes.usage = [.shaderRead, .shaderWrite]
    tmpTextureDes.storageMode = .shared
    textureDesc = tmpTextureDes
    metalTexture = device.makeTexture(descriptor: tmpTextureDes) ?! " texture nil "
  }

  init(device: MTLDevice, inDim: Dim) {
    var fourDim: Dim
    if inDim.cout() == 4 {
      fourDim = inDim
    } else if inDim.cout() < 4 {
      var fourDimNum: [Int] = []
      for _ in 0..<(4 - inDim.cout()) {
        fourDimNum.append(1)
      }
      fourDimNum.append(contentsOf: inDim.dims)
      fourDim = Dim.init(inDim: fourDimNum)
    } else {
      fatalError(" not support ")
    }
    tensorDim = inDim
    dim = fourDim
    padToFourDim = fourDim
    layout = DataLayout.init([(.N, fourDim[0]), (.C, fourDim[1]), (.H, fourDim[2]), (.W, fourDim[3])])
  }

  //  required public init(inDim: Dim, inLayout: DataLayout = .NHWC, inTexture: MTLTexture) {
  //    dim = inDim
  //    layout = inLayout
  //    metalTexture = inTexture
  //    let tmpTextureDes = MTLTextureDescriptor.init()
  //
  //    if inDim.cout() == 1 {
  //      tmpTextureDes.width = inDim[0]
  //      tmpTextureDes.textureType = .type1D
  //    } else if inDim.cout() == 2 {
  //      tmpTextureDes.height = inDim[0]
  //      tmpTextureDes.width = inDim[1]
  //      tmpTextureDes.textureType = .type2D
  //    } else if inDim.cout() == 3 {
  //      fatalError(" not support texture dim 3")
  //    } else if inDim.cout() == 4 {
  //      tmpTextureDes.height = inDim[1]
  //      tmpTextureDes.width = inDim[2]
  //      tmpTextureDes.depth = inDim[3] * inDim[1]
  //      tmpTextureDes.textureType = .type2DArray
  //    }
  //
  //    tmpTextureDes.pixelFormat = .r32Float
  //    tmpTextureDes.storageMode = .shared
  //    textureDesc = tmpTextureDes
  //    let device = MTLCreateSystemDefaultDevice()
  //    metalTexture = device!.makeTexture(descriptor: tmpTextureDes)!
  //  }

  //  init() {
  //    dim = Dim.init(inDim: [])
  //    layout = .NCHW
  //    let device = MTLCreateSystemDefaultDevice()
  //    textureDesc = MTLTextureDescriptor.init()
  //    metalTexture = device!.makeTexture(descriptor: textureDesc)!
  //  }

  private(set) var layout: DataLayout
}

extension Texture {
  public var description: String {
    return debugDescription
  }

  public var debugDescription: String {
    var str = ""
    str += "Dim: \(dim) \n value:[ "
    str += "\(metalTexture)"
    str += " ]"
    return str
  }
}
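A minimal usage sketch of the Texture API above, for orientation only (not part of this commit). It assumes the Texture, Dim and ComputePrecision declarations shown in this file; the dimensions, the NHWC transpose and the variable names are illustrative assumptions.

// Hypothetical sketch — not in the original source.
// let device = MTLCreateSystemDefaultDevice()!
// // Wrap a 1x3x224x224 NCHW tensor; dims with fewer than 4 entries are padded with leading 1s.
// let texture = Texture(device: device, inDim: Dim.init(inDim: [1, 3, 224, 224]))
// // Transpose to NHWC and back it with an .rgba16Float 2-D array texture.
// // For newDim = [1, 224, 224, 3]: width = 224, height = 224, arrayLength = (1 * 3 + 3) / 4 = 1.
// texture.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: .Float16)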
......@@ -14,12 +14,15 @@
#pragma once
#import "PaddleMobileCPU.h"
#import "CPUCompute.h"
#import "PaddleMobileGPU.h"
#import <UIKit/UIKit.h>
//! Project version number for paddle_mobile.
FOUNDATION_EXPORT double paddle_mobileVersionNumber;
//FOUNDATION_EXPORT double paddle_mobileVersionNumber;
//! Project version string for paddle_mobile.
FOUNDATION_EXPORT const unsigned char paddle_mobileVersionString[];
//FOUNDATION_EXPORT const unsigned char paddle_mobileVersionString[];
......@@ -311,6 +311,8 @@ int get_aligned_filter_num(int num) {
void format_filter(framework::Tensor *filter_tensor, float max_value,
int group_num) {
filter_tensor->scale[0] = float(max_value / 127.0);
filter_tensor->scale[1] = float(127.0 / max_value);
auto dims = filter_tensor->dims();
auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = filter_tensor->data<float>();
......
......@@ -676,11 +676,11 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()];
PADDLE_MOBILE_ENFORCE(id < ops.size(), "Index out of range");
auto last_op = id < 0 ? ops[ops.size() - 1] : ops[id];
auto output_map = last_op->Outputs();
std::vector<std::string> out_keys = last_op->GetOutKeys();
PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "the last op contains no output");
PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range");
auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
auto output_map = op->Outputs();
std::vector<std::string> out_keys = op->GetOutKeys();
PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
out_keys[0], output_map, *(program_.scope));
return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
......
......@@ -50,8 +50,8 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
void RunImpl() const {
auto input = (Tensor *)const_cast<LoDTensor *>(param_.InputX());
auto input_ptr = input->data<float>();
fpga::format_image(input);
auto input_ptr = input->data<float>();
Tensor *output = param_.Out();
auto output_ptr = output->data<float>();
......
......@@ -47,7 +47,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
concatArgs.image_num = (uint32_t)image_num;
concatArgs.images_in = images_in;
concatArgs.scales_in = scales_in;
concatArgs.image_out = (half *)out->mutable_data<float>();
concatArgs.image_out = (half *)out->data<float>();
concatArgs.scale_out = out->scale;
concatArgs.channel_num = channel_num;
concatArgs.height = (uint32_t)height;
......
......@@ -39,8 +39,8 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
args.image.height = 1;
args.image.width = 1;
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = float_input->mutable_data<float>();
args.output.address = float_input->data<float>();
args.output.scale_address = float_input->scale;
param->SetFloatInput(float_input);
param->SetFpgaArgs(args);
return true;
......
......@@ -21,8 +21,15 @@ int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../../../test/models/googlenet
// ../../../test/models/mobilenet
<<<<<<< HEAD
auto program = loader.Load(g_mobilenet_ssd, false, false);
// auto program = loader.Load(g_googlenet_combine + "/model",
// g_googlenet_combine +
// "/params", true);
=======
// auto program = loader.Load(g_googlenet, true);
// auto program = loader.Load(g_mobilenet_ssd, true);
>>>>>>> e60ab7ae5a43b9cc788813877fbfffc67c87b5f3
auto program = loader.Load(std::string(g_ocr) + "/model",
std::string(g_ocr) + "/params", false);
......