diff --git a/CMakeLists.txt b/CMakeLists.txt index f824a25efb870556d88e62bc198f3afd3954de79..7c693afd0476ee8d3df20aacda120cae9aa4c205 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -131,8 +131,8 @@ endif() if(IS_IOS) else() - list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobile.h) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobile.mm) + list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.h) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.mm) list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h) endif() diff --git a/README.md b/README.md index fd5222655821e36fe194225a4d71a3b60b8a89d5..de7dd530c94b4a3055cbf07a4a19a55c21457ed0 100644 --- a/README.md +++ b/README.md @@ -69,8 +69,18 @@ Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平 - **苹果设备的GPU Metal实现** - 基于Metal实现的苹果设备的GPU预测库,也已经在实现中,近期也会有相应可运行版本。 - +|mobilenetfssd|速度| +|------------|-----| +|A9(ms)|33.78| +|A10(ms)|24.05| +|A11(ms)|17.15| +||| +|genet|速度| +|A9(ms) |3.49| +|A10(ms)|2.54| +|A11(ms)|1.43| + + - **FPGA** FPGA实现正在进行中,是基于Xilinx的ZU5目标开发板。 diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj index f3ab9fc66a072cd5b0bbba56ae99258f04be3612..d6114880efcaf528bd26fcda11e08ec68d943575 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj +++ b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj @@ -8,22 +8,29 @@ /* Begin PBXBuildFile section */ 30D0ED21F392CFA3885B1002 /* Pods_paddle_mobile_demo.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 18896810981724F8A0FED62A /* Pods_paddle_mobile_demo.framework */; }; + C2E67E5E21524E460013F575 /* LoadPointerViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = C2E67E5D21524E460013F575 /* LoadPointerViewController.m */; }; FC013928210204A3008100E3 /* PreProcessKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC013927210204A3008100E3 /* PreProcessKernel.metal */; }; FC039B8220E11C550081E9F8 /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B8120E11C550081E9F8 /* AppDelegate.swift */; }; FC039B8420E11C550081E9F8 /* ViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B8320E11C550081E9F8 /* ViewController.swift */; }; FC039B8720E11C550081E9F8 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC039B8520E11C550081E9F8 /* Main.storyboard */; }; FC039B8920E11C560081E9F8 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = FC039B8820E11C560081E9F8 /* Assets.xcassets */; }; FC039B8C20E11C560081E9F8 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC039B8A20E11C560081E9F8 /* LaunchScreen.storyboard */; }; - FC3602C82108580600FACB58 /* MetalHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC3602C72108580600FACB58 /* MetalHelper.swift */; }; - FC918191211DBC3500B6F354 /* paddle-mobile.png in Resources */ = {isa = PBXBuildFile; fileRef = FC918190211DBC3500B6F354 /* paddle-mobile.png */; }; - FC918193211DC70500B6F354 /* iphone.JPG in Resources */ = {isa = PBXBuildFile; fileRef = FC918192211DC70500B6F354 /* iphone.JPG */; }; - FCD04E6320F3146B0007374F /* params in Resources */ = {isa = PBXBuildFile; fileRef = FCD04E6120F3146A0007374F /* params */; }; - FCD04E6420F3146B0007374F /* model in Resources */ = {isa = PBXBuildFile; fileRef = 
FCD04E6220F3146A0007374F /* model */; }; - FCDFD3FB211D72C3005AB38B /* ModelHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDFD3FA211D72C3005AB38B /* ModelHelper.swift */; }; - FCDFD41B211D91C7005AB38B /* synset.txt in Resources */ = {isa = PBXBuildFile; fileRef = FCDFD41A211D91C7005AB38B /* synset.txt */; }; + FC803BCD214D27930094B8E5 /* FPSCounter.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BCB214D27920094B8E5 /* FPSCounter.swift */; }; + FC803BCE214D27930094B8E5 /* VideoCapture.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BCC214D27920094B8E5 /* VideoCapture.swift */; }; + FCBCCC552122EF5500D94F7E /* MetalHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC542122EF5400D94F7E /* MetalHelper.swift */; }; FCEBEC2C20E1391F00C0B14D /* paddle_mobile.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */; }; FCEBEC2D20E1391F00C0B14D /* paddle_mobile.framework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; }; - FCEEE7D4210627A000444BEC /* banana.jpeg in Resources */ = {isa = PBXBuildFile; fileRef = FCEEE7D3210627A000444BEC /* banana.jpeg */; }; + FCF437E8214B6DDB00943429 /* MultiPredictViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCF437E7214B6DDB00943429 /* MultiPredictViewController.swift */; }; + FCFE9B692152858600DECA15 /* hand.jpg.zip in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B632152858600DECA15 /* hand.jpg.zip */; }; + FCFE9B6A2152858600DECA15 /* synset.txt in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B642152858600DECA15 /* synset.txt */; }; + FCFE9B6B2152858600DECA15 /* banana.jpeg in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B652152858600DECA15 /* banana.jpeg */; }; + FCFE9B6C2152858600DECA15 /* hand.jpg in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B662152858600DECA15 /* hand.jpg */; }; + FCFE9B6D2152858600DECA15 /* iphone.JPG in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B672152858600DECA15 /* iphone.JPG */; }; + FCFE9B6E2152858600DECA15 /* paddle-mobile.png in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B682152858600DECA15 /* paddle-mobile.png */; }; + FCFE9C512152859600DECA15 /* genet_params in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B752152859500DECA15 /* genet_params */; }; + FCFE9C522152859600DECA15 /* genet_model in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B762152859500DECA15 /* genet_model */; }; + FCFE9D232152859600DECA15 /* ar_model in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9C4C2152859500DECA15 /* ar_model */; }; + FCFE9D242152859600DECA15 /* ar_params in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9C4D2152859500DECA15 /* ar_params */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -44,6 +51,8 @@ 081C9CF10DB06C58B8B6B039 /* Pods-paddle-mobile-demo.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-demo.release.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-demo/Pods-paddle-mobile-demo.release.xcconfig"; sourceTree = ""; }; 18896810981724F8A0FED62A /* Pods_paddle_mobile_demo.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_paddle_mobile_demo.framework; sourceTree = BUILT_PRODUCTS_DIR; }; 878829884E1A14D7044721D5 /* 
Pods-paddle-mobile-demo.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-demo.debug.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-demo/Pods-paddle-mobile-demo.debug.xcconfig"; sourceTree = ""; }; + C2E67E5C21524E460013F575 /* LoadPointerViewController.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = LoadPointerViewController.h; sourceTree = ""; }; + C2E67E5D21524E460013F575 /* LoadPointerViewController.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = LoadPointerViewController.m; sourceTree = ""; }; FC013927210204A3008100E3 /* PreProcessKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PreProcessKernel.metal; sourceTree = ""; }; FC039B7E20E11C550081E9F8 /* paddle-mobile-demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = "paddle-mobile-demo.app"; sourceTree = BUILT_PRODUCTS_DIR; }; FC039B8120E11C550081E9F8 /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = ""; }; @@ -52,15 +61,23 @@ FC039B8820E11C560081E9F8 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; FC039B8B20E11C560081E9F8 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = ""; }; FC039B8D20E11C560081E9F8 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; - FC3602C72108580600FACB58 /* MetalHelper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; name = MetalHelper.swift; path = "../../paddle-mobile-unit-test/paddle-mobile-unit-test/MetalHelper.swift"; sourceTree = ""; }; - FC918190211DBC3500B6F354 /* paddle-mobile.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; path = "paddle-mobile.png"; sourceTree = ""; }; - FC918192211DC70500B6F354 /* iphone.JPG */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = iphone.JPG; sourceTree = ""; }; - FCD04E6120F3146A0007374F /* params */ = {isa = PBXFileReference; lastKnownFileType = file; path = params; sourceTree = ""; }; - FCD04E6220F3146A0007374F /* model */ = {isa = PBXFileReference; lastKnownFileType = file; path = model; sourceTree = ""; }; - FCDFD3FA211D72C3005AB38B /* ModelHelper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ModelHelper.swift; sourceTree = ""; }; - FCDFD41A211D91C7005AB38B /* synset.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = synset.txt; sourceTree = ""; }; + FC27991121343A39000B6BAD /* paddle-mobile-demo-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "paddle-mobile-demo-Bridging-Header.h"; sourceTree = ""; }; + FC4FD97B2140EE250073E130 /* libc++.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = "libc++.tbd"; path = "usr/lib/libc++.tbd"; sourceTree = SDKROOT; }; + FC803BCB214D27920094B8E5 /* FPSCounter.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FPSCounter.swift; sourceTree = ""; }; + FC803BCC214D27920094B8E5 /* VideoCapture.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = 
VideoCapture.swift; sourceTree = ""; }; + FCBCCC542122EF5400D94F7E /* MetalHelper.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MetalHelper.swift; sourceTree = ""; }; FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; path = paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; }; - FCEEE7D3210627A000444BEC /* banana.jpeg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = banana.jpeg; sourceTree = ""; }; + FCF437E7214B6DDB00943429 /* MultiPredictViewController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MultiPredictViewController.swift; sourceTree = ""; }; + FCFE9B632152858600DECA15 /* hand.jpg.zip */ = {isa = PBXFileReference; lastKnownFileType = archive.zip; path = hand.jpg.zip; sourceTree = ""; }; + FCFE9B642152858600DECA15 /* synset.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = synset.txt; sourceTree = ""; }; + FCFE9B652152858600DECA15 /* banana.jpeg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = banana.jpeg; sourceTree = ""; }; + FCFE9B662152858600DECA15 /* hand.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = hand.jpg; sourceTree = ""; }; + FCFE9B672152858600DECA15 /* iphone.JPG */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = iphone.JPG; sourceTree = ""; }; + FCFE9B682152858600DECA15 /* paddle-mobile.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; path = "paddle-mobile.png"; sourceTree = ""; }; + FCFE9B752152859500DECA15 /* genet_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = genet_params; sourceTree = ""; }; + FCFE9B762152859500DECA15 /* genet_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = genet_model; sourceTree = ""; }; + FCFE9C4C2152859500DECA15 /* ar_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = ar_model; sourceTree = ""; }; + FCFE9C4D2152859500DECA15 /* ar_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = ar_params; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -88,6 +105,7 @@ 7B7DED984E9EE7BFB45E24E8 /* Frameworks */ = { isa = PBXGroup; children = ( + FC4FD97B2140EE250073E130 /* libc++.tbd */, 18896810981724F8A0FED62A /* Pods_paddle_mobile_demo.framework */, ); name = Frameworks; @@ -115,49 +133,82 @@ FC039B8020E11C550081E9F8 /* paddle-mobile-demo */ = { isa = PBXGroup; children = ( - FC0E2C2020EDC03B009C1FAC /* models */, - FC0E2C1D20EDC030009C1FAC /* images */, + FCFE9B6F2152859500DECA15 /* models */, + FCFE9B622152858600DECA15 /* images */, + FC803BCA214D27920094B8E5 /* VideoCapture */, + FC8CFED2213519540094D569 /* Net */, FC039B8120E11C550081E9F8 /* AppDelegate.swift */, - FC013927210204A3008100E3 /* PreProcessKernel.metal */, FC039B8320E11C550081E9F8 /* ViewController.swift */, FC039B8520E11C550081E9F8 /* Main.storyboard */, FC039B8820E11C560081E9F8 /* Assets.xcassets */, FC039B8A20E11C560081E9F8 /* LaunchScreen.storyboard */, FC039B8D20E11C560081E9F8 /* Info.plist */, - FC3602C72108580600FACB58 /* MetalHelper.swift */, - FCDFD3FA211D72C3005AB38B /* ModelHelper.swift */, + FC27991121343A39000B6BAD /* paddle-mobile-demo-Bridging-Header.h */, + FCF437E7214B6DDB00943429 /* MultiPredictViewController.swift */, + C2E67E5C21524E460013F575 /* LoadPointerViewController.h */, + C2E67E5D21524E460013F575 /* LoadPointerViewController.m */, ); 
path = "paddle-mobile-demo"; sourceTree = ""; }; - FC0E2C1D20EDC030009C1FAC /* images */ = { + FC803BCA214D27920094B8E5 /* VideoCapture */ = { + isa = PBXGroup; + children = ( + FC803BCB214D27920094B8E5 /* FPSCounter.swift */, + FC803BCC214D27920094B8E5 /* VideoCapture.swift */, + ); + path = VideoCapture; + sourceTree = ""; + }; + FC8CFED2213519540094D569 /* Net */ = { + isa = PBXGroup; + children = ( + FC013927210204A3008100E3 /* PreProcessKernel.metal */, + FCBCCC542122EF5400D94F7E /* MetalHelper.swift */, + ); + path = Net; + sourceTree = ""; + }; + FCFE9B622152858600DECA15 /* images */ = { isa = PBXGroup; children = ( - FC918192211DC70500B6F354 /* iphone.JPG */, - FC918190211DBC3500B6F354 /* paddle-mobile.png */, - FCDFD41A211D91C7005AB38B /* synset.txt */, - FCEEE7D3210627A000444BEC /* banana.jpeg */, + FCFE9B632152858600DECA15 /* hand.jpg.zip */, + FCFE9B642152858600DECA15 /* synset.txt */, + FCFE9B652152858600DECA15 /* banana.jpeg */, + FCFE9B662152858600DECA15 /* hand.jpg */, + FCFE9B672152858600DECA15 /* iphone.JPG */, + FCFE9B682152858600DECA15 /* paddle-mobile.png */, ); name = images; path = ../../images; sourceTree = ""; }; - FC0E2C2020EDC03B009C1FAC /* models */ = { + FCFE9B6F2152859500DECA15 /* models */ = { isa = PBXGroup; children = ( - FCD04E6020F3146A0007374F /* mobilenet */, + FCFE9B742152859500DECA15 /* genet */, + FCFE9C4B2152859500DECA15 /* fluid_fssd_new_ar */, ); name = models; path = ../../models; sourceTree = ""; }; - FCD04E6020F3146A0007374F /* mobilenet */ = { + FCFE9B742152859500DECA15 /* genet */ = { + isa = PBXGroup; + children = ( + FCFE9B752152859500DECA15 /* genet_params */, + FCFE9B762152859500DECA15 /* genet_model */, + ); + path = genet; + sourceTree = ""; + }; + FCFE9C4B2152859500DECA15 /* fluid_fssd_new_ar */ = { isa = PBXGroup; children = ( - FCD04E6120F3146A0007374F /* params */, - FCD04E6220F3146A0007374F /* model */, + FCFE9C4C2152859500DECA15 /* ar_model */, + FCFE9C4D2152859500DECA15 /* ar_params */, ); - path = mobilenet; + path = fluid_fssd_new_ar; sourceTree = ""; }; /* End PBXGroup section */ @@ -195,6 +246,7 @@ TargetAttributes = { FC039B7D20E11C550081E9F8 = { CreatedOnToolsVersion = 9.3.1; + LastSwiftMigration = 0940; }; }; }; @@ -221,14 +273,18 @@ isa = PBXResourcesBuildPhase; buildActionMask = 2147483647; files = ( - FCD04E6320F3146B0007374F /* params in Resources */, + FCFE9D232152859600DECA15 /* ar_model in Resources */, FC039B8C20E11C560081E9F8 /* LaunchScreen.storyboard in Resources */, - FC918191211DBC3500B6F354 /* paddle-mobile.png in Resources */, + FCFE9C522152859600DECA15 /* genet_model in Resources */, + FCFE9D242152859600DECA15 /* ar_params in Resources */, + FCFE9B6E2152858600DECA15 /* paddle-mobile.png in Resources */, + FCFE9C512152859600DECA15 /* genet_params in Resources */, + FCFE9B692152858600DECA15 /* hand.jpg.zip in Resources */, FC039B8920E11C560081E9F8 /* Assets.xcassets in Resources */, - FCEEE7D4210627A000444BEC /* banana.jpeg in Resources */, - FC918193211DC70500B6F354 /* iphone.JPG in Resources */, - FCDFD41B211D91C7005AB38B /* synset.txt in Resources */, - FCD04E6420F3146B0007374F /* model in Resources */, + FCFE9B6A2152858600DECA15 /* synset.txt in Resources */, + FCFE9B6B2152858600DECA15 /* banana.jpeg in Resources */, + FCFE9B6D2152858600DECA15 /* iphone.JPG in Resources */, + FCFE9B6C2152858600DECA15 /* hand.jpg in Resources */, FC039B8720E11C550081E9F8 /* Main.storyboard in Resources */, ); runOnlyForDeploymentPostprocessing = 0; @@ -280,10 +336,13 @@ buildActionMask = 2147483647; files = ( 
FC039B8420E11C550081E9F8 /* ViewController.swift in Sources */, - FCDFD3FB211D72C3005AB38B /* ModelHelper.swift in Sources */, + FC803BCE214D27930094B8E5 /* VideoCapture.swift in Sources */, FC013928210204A3008100E3 /* PreProcessKernel.metal in Sources */, + FCF437E8214B6DDB00943429 /* MultiPredictViewController.swift in Sources */, + FCBCCC552122EF5500D94F7E /* MetalHelper.swift in Sources */, + FC803BCD214D27930094B8E5 /* FPSCounter.swift in Sources */, + C2E67E5E21524E460013F575 /* LoadPointerViewController.m in Sources */, FC039B8220E11C550081E9F8 /* AppDelegate.swift in Sources */, - FC3602C82108580600FACB58 /* MetalHelper.swift in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -428,19 +487,23 @@ baseConfigurationReference = 878829884E1A14D7044721D5 /* Pods-paddle-mobile-demo.debug.xcconfig */; buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + CLANG_ENABLE_MODULES = YES; CODE_SIGN_IDENTITY = "iPhone Developer"; CODE_SIGN_STYLE = Automatic; DEVELOPMENT_TEAM = A798K58VVL; + ENABLE_BITCODE = NO; INFOPLIST_FILE = "paddle-mobile-demo/Info.plist"; IPHONEOS_DEPLOYMENT_TARGET = 9.0; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/Frameworks", ); - PRODUCT_BUNDLE_IDENTIFIER = com.paddlemobile.metal; + PRODUCT_BUNDLE_IDENTIFIER = "com.baidu.paddle-mobile"; PRODUCT_NAME = "$(TARGET_NAME)"; PROVISIONING_PROFILE = ""; PROVISIONING_PROFILE_SPECIFIER = ""; + SWIFT_OBJC_BRIDGING_HEADER = "paddle-mobile-demo/paddle-mobile-demo-Bridging-Header.h"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; SWIFT_VERSION = 4.0; TARGETED_DEVICE_FAMILY = "1,2"; }; @@ -451,19 +514,22 @@ baseConfigurationReference = 081C9CF10DB06C58B8B6B039 /* Pods-paddle-mobile-demo.release.xcconfig */; buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + CLANG_ENABLE_MODULES = YES; CODE_SIGN_IDENTITY = "iPhone Developer"; CODE_SIGN_STYLE = Automatic; DEVELOPMENT_TEAM = A798K58VVL; + ENABLE_BITCODE = NO; INFOPLIST_FILE = "paddle-mobile-demo/Info.plist"; IPHONEOS_DEPLOYMENT_TARGET = 9.0; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/Frameworks", ); - PRODUCT_BUNDLE_IDENTIFIER = com.paddlemobile.metal; + PRODUCT_BUNDLE_IDENTIFIER = "com.baidu.paddle-mobile"; PRODUCT_NAME = "$(TARGET_NAME)"; PROVISIONING_PROFILE = ""; PROVISIONING_PROFILE_SPECIFIER = ""; + SWIFT_OBJC_BRIDGING_HEADER = "paddle-mobile-demo/paddle-mobile-demo-Bridging-Header.h"; SWIFT_VERSION = 4.0; TARGETED_DEVICE_FAMILY = "1,2"; }; diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift index 54dad2b5bf721f3d132bad2502d30b34ca0773ab..537fb06ed9e5b9100bea43b7acae9c014e0f4a78 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift @@ -19,7 +19,6 @@ class AppDelegate: UIResponder, UIApplicationDelegate { var window: UIWindow? - func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplicationLaunchOptionsKey: Any]?) -> Bool { // Override point for customization after application launch. 
return true diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard index a5efadeb97ccc41449dc32a2c1dfcdfcf9fceac5..d72694fdacf5b46821ba6422fa77e095f92382b9 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard @@ -11,6 +11,34 @@ + + + + + + + + + + + + + + + + + + + + + + @@ -20,12 +48,11 @@ - + - + @@ -159,11 +190,12 @@ - + + @@ -175,10 +207,12 @@ + + @@ -195,11 +229,81 @@ + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/LoadPointerViewController.h b/metal/paddle-mobile-demo/paddle-mobile-demo/LoadPointerViewController.h new file mode 100644 index 0000000000000000000000000000000000000000..a876c236219817bf146cfa4a77eb9421f8472971 --- /dev/null +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/LoadPointerViewController.h @@ -0,0 +1,13 @@ +// +// LoadPointerViewController.h +// paddle-mobile-demo +// +// Created by Xiao,Haichun on 2018/9/19. +// Copyright © 2018年 orange. All rights reserved. +// + +#import + +@interface LoadPointerViewController : UIViewController + +@end diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/LoadPointerViewController.m b/metal/paddle-mobile-demo/paddle-mobile-demo/LoadPointerViewController.m new file mode 100644 index 0000000000000000000000000000000000000000..857745686fbe750de08e8be357ccf5a4159eaae8 --- /dev/null +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/LoadPointerViewController.m @@ -0,0 +1,171 @@ +// +// LoadPointerViewController.m +// paddle-mobile-demo +// +// Created by Xiao,Haichun on 2018/9/19. +// Copyright © 2018年 orange. All rights reserved. +// + +#import "LoadPointerViewController.h" +#import +#import "paddle-mobile-demo-Bridging-Header.h" + +@interface LoadPointerViewController () + +@property (strong, nonatomic) id device; +@property (strong, nonatomic) id texture; +@property (strong, nonatomic) id queue; +@property (strong, nonatomic) PaddleMobileGPU *runner; +@property (strong, nonatomic) ModelConfig *modelConfig; + +@end + +@implementation LoadPointerViewController + +- (void)viewDidLoad { + [super viewDidLoad]; + + + self.device = MTLCreateSystemDefaultDevice(); + + self.queue = [self.device newCommandQueue]; + + // Do any additional setup after loading the view. 
+// NSString *modelPath = [[NSBundle mainBundle] URLForResource:@"genet_model" withExtension:nil].path; +// NSString *paramPath = [[NSBundle mainBundle] URLForResource:@"genet_params" withExtension:nil].path; + + NSString *modelPath = [[NSBundle mainBundle] URLForResource:@"ar_model" withExtension:nil].path; + NSString *paramPath = [[NSBundle mainBundle] URLForResource:@"ar_params" withExtension:nil].path; + + long fileSize; + FILE *fp; + fp = fopen([modelPath UTF8String], "rb"); + fseek(fp, 0, SEEK_END); + fileSize = ftell(fp); + rewind(fp); + void *buffer = malloc(fileSize); + fread(buffer, 1, fileSize, fp); + fclose(fp); + + long paramfileSize; + FILE *parmaFilePointer; + parmaFilePointer = fopen([paramPath UTF8String], "rb"); + fseek(parmaFilePointer, 0, SEEK_END); + paramfileSize = ftell(parmaFilePointer); + rewind(parmaFilePointer); + void *parmaBuffer = malloc(paramfileSize); + fread(parmaBuffer, 1, paramfileSize, parmaFilePointer); + fclose(parmaFilePointer); + + _modelConfig = [[ModelConfig alloc] init]; +// _modelConfig.means = @[[NSNumber numberWithFloat:128.0], [NSNumber numberWithFloat:128.0], [NSNumber numberWithFloat:128.0]]; +// _modelConfig.scale = 0.017; +// _modelConfig.dims = @[[NSNumber numberWithFloat:1], [NSNumber numberWithFloat:128.], [NSNumber numberWithFloat:128.0],[NSNumber numberWithFloat:3.0]]; + _modelConfig.means = @[[NSNumber numberWithFloat:103.94], [NSNumber numberWithFloat:116.78], [NSNumber numberWithFloat:123.68]]; + _modelConfig.scale = 1; + _modelConfig.dims = @[[NSNumber numberWithFloat:1], [NSNumber numberWithFloat:160.], [NSNumber numberWithFloat:160.0],[NSNumber numberWithFloat:3.0]]; + _modelConfig.modelPointer = buffer; + _modelConfig.modelSize = (int)fileSize; + _modelConfig.paramPointer = parmaBuffer; + _modelConfig.paramSize = (int)paramfileSize; +} +- (IBAction)loaderButtonPressed:(id)sender { +// _runner = [[PaddleMobileGPU alloc] initWithCommandQueue:self.queue net:GenetType modelConfig:_modelConfig]; + _runner = [[PaddleMobileGPU alloc] initWithCommandQueue:self.queue net:MobileNetSSDType modelConfig:_modelConfig]; + + [_runner load]; +} +- (IBAction)predictButtonPressed:(id)sender { + [self predict]; +} + +- (id) createTextureFromImage:(UIImage*) image device:(id) device +{ + image =[UIImage imageWithCGImage:[image CGImage] + scale:[image scale] + orientation: UIImageOrientationLeft]; + + NSLog(@"orientation and size and stuff %ld %f %f", (long)image.imageOrientation, image.size.width, image.size.height); + + CGImageRef imageRef = image.CGImage; + + size_t width = self.view.frame.size.width; + size_t height = self.view.frame.size.height; + + size_t bitsPerComponent = CGImageGetBitsPerComponent(imageRef); + size_t bitsPerPixel = CGImageGetBitsPerPixel(imageRef); + + CGColorSpaceRef colorSpace = CGImageGetColorSpace(imageRef); + + CGImageAlphaInfo alphaInfo = CGImageGetAlphaInfo(imageRef); + + // NSLog(@"%@ %u", colorSpace, alphaInfo); + + CGBitmapInfo bitmapInfo = kCGBitmapByteOrderDefault | alphaInfo; + // NSLog(@"bitmap info %u", bitmapInfo); + + + CGContextRef context = CGBitmapContextCreate( NULL, width, height, bitsPerComponent, (bitsPerPixel / 8) * width, colorSpace, bitmapInfo); + + if( !context ) + { + NSLog(@"Failed to load image, probably an unsupported texture type"); + return nil; + } + + CGContextDrawImage( context, CGRectMake( 0, 0, width, height ), image.CGImage); + + + MTLPixelFormat format = MTLPixelFormatRGBA8Unorm; + + MTLTextureDescriptor *texDesc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:format + 
width:width + height:height + mipmapped:NO]; + id texture = [device newTextureWithDescriptor:texDesc]; + + [texture replaceRegion:MTLRegionMake2D(0, 0, width, height) + mipmapLevel:0 + withBytes:CGBitmapContextGetData(context) + bytesPerRow:4 * width]; + + return texture; +} + +- (void)predict { + _texture = [self createTextureFromImage:[UIImage imageNamed:@"hand.jpg"] device:self.device]; + NSTimeInterval startTime = [[NSDate date] timeIntervalSince1970]; + NSInteger max = 428; + for (int i = 0;i < max; i ++) { + [_runner predict:_texture withCompletion:^(BOOL success , NSArray *result) { + if (success) { + if (i == max -1) { + double time = [[NSDate date] timeIntervalSince1970] - startTime; + time = (time/max)*1000; + NSLog(@"gap ==== %fms",time); + } +// for (int i = 0; i < result.count; i ++) { +// NSNumber *number = result[i]; +// NSLog(@"result %d = %f:",i, [number floatValue]); +// } + } + }]; + } +} + +- (void)didReceiveMemoryWarning { + [super didReceiveMemoryWarning]; + // Dispose of any resources that can be recreated. +} + +/* +#pragma mark - Navigation + +// In a storyboard-based application, you will often want to do a little preparation before navigation +- (void)prepareForSegue:(UIStoryboardSegue *)segue sender:(id)sender { + // Get the new view controller using [segue destinationViewController]. + // Pass the selected object to the new view controller. +} +*/ + +@end diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift deleted file mode 100644 index 74fa89d93e042f90fe1b590a596ec584fff67f6d..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift +++ /dev/null @@ -1,48 +0,0 @@ -// -// MetalHelper.swift -// paddle-mobile-demo -// -// Created by liuRuiLong on 2018/7/25. -// Copyright © 2018年 orange. All rights reserved. -// - -import Metal -import MetalKit -import Foundation -import paddle_mobile -import MetalPerformanceShaders - -class MetalHelper { - let device: MTLDevice - let queue: MTLCommandQueue - let textureLoader: MTKTextureLoader - static let shared: MetalHelper = MetalHelper.init() - private init(){ - device = MTLCreateSystemDefaultDevice()! - queue = device.makeCommandQueue()! - textureLoader = MTKTextureLoader.init(device: device) - } - - static func scaleTexture(queue: MTLCommandQueue, input: MTLTexture, size:(width: Int, height: Int), complete: @escaping (MTLTexture) -> Void) { - let tmpTextureDes = MTLTextureDescriptor.init() - tmpTextureDes.width = size.width - tmpTextureDes.height = size.height - tmpTextureDes.depth = 1 - tmpTextureDes.usage = [.shaderRead, .shaderWrite] - tmpTextureDes.pixelFormat = .rgba32Float - tmpTextureDes.textureType = .type2D - tmpTextureDes.storageMode = .shared - tmpTextureDes.cpuCacheMode = .defaultCache - let dest = MetalHelper.shared.device.makeTexture(descriptor: tmpTextureDes) - - let scale = MPSImageLanczosScale.init(device: MetalHelper.shared.device) - - let buffer = queue.makeCommandBuffer() - scale.encode(commandBuffer: buffer!, sourceTexture: input, destinationTexture: dest!) - buffer?.addCompletedHandler({ (buffer) in - complete(dest!) 
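
> Editor note: the LoadPointerViewController added above exercises the new pointer-based loading path — the model and parameter files are read into heap buffers with fopen/fread and handed to PaddleMobileGPU through ModelConfig's modelPointer/modelSize and paramPointer/paramSize fields rather than as file paths, and predict latency is averaged over repeated runs. Below is a minimal Swift sketch of the same file-to-buffer step using only Foundation; the helper name is illustrative and not part of this change.

```swift
import Foundation

// Sketch (assumption, not framework code): read a bundled resource into a heap buffer,
// mirroring what LoadPointerViewController does with fopen/fread before filling
// ModelConfig.modelPointer / paramPointer.
func loadResourceIntoBuffer(named name: String) -> (pointer: UnsafeMutableRawPointer, size: Int)? {
    guard let url = Bundle.main.url(forResource: name, withExtension: nil),
          let data = try? Data(contentsOf: url) else { return nil }
    let buffer = UnsafeMutableRawPointer.allocate(byteCount: data.count, alignment: 1)
    data.copyBytes(to: buffer.assumingMemoryBound(to: UInt8.self), count: data.count)
    // Caller owns the buffer and must keep it alive while the runner uses it.
    return (buffer, data.count)
}

// Hypothetical usage: feed both buffers to ModelConfig before creating PaddleMobileGPU.
// let model  = loadResourceIntoBuffer(named: "ar_model")
// let params = loadResourceIntoBuffer(named: "ar_params")
```
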
- }) - buffer?.commit() - } -} - diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/ModelHelper.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/ModelHelper.swift deleted file mode 100644 index 7e1f66855e45453eee9fdbe034a309aee44ff960..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/ModelHelper.swift +++ /dev/null @@ -1,89 +0,0 @@ -// -// ModelHelper.swift -// paddle-mobile-demo -// -// Created by liuRuiLong on 2018/8/10. -// Copyright © 2018年 orange. All rights reserved. -// - -import UIKit -import MetalKit -import Foundation -import paddle_mobile -import MetalPerformanceShaders - -class PreProccess: CusomKernel { - init(device: MTLDevice) { - let s = CusomKernel.Shape.init(inWidth: 224, inHeight: 224, inChannel: 3) - super.init(device: device, inFunctionName: "preprocess", outputDim: s, usePaddleMobileLib: false) - } -} - -let modelHelperMap: [SupportModel : ModelHelper] = [.mobilenet : MobileNetHelper.init()] - -enum SupportModel: String{ - case mobilenet = "mobilenet" - static func supportedModels() -> [SupportModel] { - return [.mobilenet] - } -} - -protocol ModelHelper { - var dim: [Int] { get } - var modelPath: String { get } - var paramPath: String { get } - var modelDir: String { get } - var preprocessKernel: CusomKernel { get } - func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) - func resultStr(res: [Float]) -> String -} - -extension ModelHelper { - func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) { - let texture = try? MetalHelper.shared.textureLoader.newTexture(cgImage: image, options: [:]) ?! " texture loader error" - MetalHelper.scaleTexture(queue: MetalHelper.shared.queue, input: texture!, size: (224, 224)) { (resTexture) in - getTexture(resTexture) - } - } -} - -struct MobileNetHelper: ModelHelper{ - class PreWords { - var contents: [String] = [] - init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) { - if let filePath = inBundle.path(forResource: fileName, ofType: type) { - let string = try! String.init(contentsOfFile: filePath) - contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{ - String($0[$0.index($0.startIndex, offsetBy: 10)...]) - } - }else{ - fatalError("no file call \(fileName)") - } - } - subscript(index: Int) -> String{ - return contents[index] - } - } - let labels = PreWords.init(fileName: "synset") - - func resultStr(res: [Float]) -> String { - var s: [String] = [] - res.top(r: 5).enumerated().forEach{ - s.append(String(format: "%d: %@ (%3.2f%%)", $0 + 1, labels[$1.0], $1.1 * 100)) - } - return s.joined(separator: "\n") - } - - var preprocessKernel: CusomKernel - let dim = [1, 224, 224, 3] - let modelPath: String - let paramPath: String - let modelDir: String - - init() { - modelPath = Bundle.main.path(forResource: "model", ofType: nil) ?! "model null" - paramPath = Bundle.main.path(forResource: "params", ofType: nil) ?! "para null" - modelDir = "" - preprocessKernel = PreProccess.init(device: MetalHelper.shared.device) - } -} diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift new file mode 100644 index 0000000000000000000000000000000000000000..bd07da61d0215b243372c27addf60efc3b2ad7d6 --- /dev/null +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import UIKit +import paddle_mobile + +class MultiPredictViewController: UIViewController { + var runner1: Runner! + var runner2: Runner! + override func viewDidLoad() { + super.viewDidLoad() + let mobileNet = MobileNet_ssd_hand.init(device: MetalHelper.shared.device) + let genet = Genet.init(device: MetalHelper.shared.device) + runner1 = Runner.init(inNet: mobileNet, commandQueue: MetalHelper.shared.queue, inPlatform: .GPU) + let queue2 = MetalHelper.shared.device.makeCommandQueue() + + runner2 = Runner.init(inNet: genet, commandQueue: MetalHelper.shared.queue, inPlatform: .GPU) + } + + @IBAction func predictAct(_ sender: Any) { + let success = self.runner2.load() +// DispatchQueue.global().async { + let image1 = UIImage.init(named: "hand.jpg") +// let success = self.runner2.load() +// if success { +// for i in 0..<10000 { +// print(i) +// self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in +// print("result1: ") +//// print(res) +// }) +// } +// } else { +// print("load failed") +// } +// self.runner1.clear() +// } +// return +// DispatchQueue.global().async { +//// sleep(1) +// let image1 = UIImage.init(named: "banana.jpeg") +//// if success { +// for _ in 0..<10 { +// self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in +// print("result2: ") +// print(res) +// }) +// } +//// } else { +//// print("load failed") +//// } +//// self.runner2.clear() +// } + } +} diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MetalHelper.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MetalHelper.swift new file mode 100644 index 0000000000000000000000000000000000000000..d314e8b3f8845ef95b36b4b25e61809d353f0f24 --- /dev/null +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MetalHelper.swift @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Metal +import MetalKit +import Foundation +import paddle_mobile + +class MetalHelper { + let device: MTLDevice + let queue: MTLCommandQueue + let textureLoader: MTKTextureLoader + static let shared: MetalHelper = MetalHelper.init() + private init(){ + device = MTLCreateSystemDefaultDevice()! + queue = device.makeCommandQueue()! 
+ textureLoader = MTKTextureLoader.init(device: device) + } + + +} + diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PaddleMobile.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PaddleMobile.swift new file mode 100644 index 0000000000000000000000000000000000000000..a954328acae3a80643ad849d58cd6ac86bf7865e --- /dev/null +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PaddleMobile.swift @@ -0,0 +1,9 @@ +// +// PaddleMobile.swift +// paddle-mobile-demo +// +// Created by liuRuiLong on 2018/9/5. +// Copyright © 2018年 orange. All rights reserved. +// + +import Foundation diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..ac07e449bc5919a37a57143aa6881f79507a45b4 --- /dev/null +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal @@ -0,0 +1,137 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +using namespace metal; + + +kernel void mobilenet_preprocess( + texture2d inTexture [[texture(0)]], + texture2d outTexture [[texture(1)]], + uint2 gid [[thread_position_in_grid]]) +{ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f); + const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); +} + +kernel void mobilenet_preprocess_half( + texture2d inTexture [[texture(0)]], + texture2d outTexture [[texture(1)]], + uint2 gid [[thread_position_in_grid]]) +{ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f); + const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); +} + +kernel void mobilenet_ssd_preprocess( + texture2d inTexture [[texture(0)]], + texture2d outTexture [[texture(1)]], + uint2 gid [[thread_position_in_grid]]) +{ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f); + const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); +} + +kernel void mobilenet_ssd_preprocess_half( + texture2d inTexture [[texture(0)]], + texture2d outTexture [[texture(1)]], + uint2 gid [[thread_position_in_grid]]) +{ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f); + const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); +} + +kernel void genet_preprocess(texture2d inTexture [[texture(0)]], texture2d 
outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) +{ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f); + const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); +} + +kernel void genet_preprocess_half(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) +{ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f); + const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); +} + +kernel void mobilent_ar_preprocess(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) +{ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f); + const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); +} + +kernel void mobilent_ar_preprocess_half(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) +{ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f); + const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); +} + +kernel void scale(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) return; + float w_stride = inTexture.get_width() / outTexture.get_width(); + float h_stride = inTexture.get_height() / outTexture.get_height(); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 input = inTexture.sample(sample, float2(gid.x * w_stride, gid.y * h_stride), 0); + outTexture.write(input, gid); +} + +kernel void scale_half(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) return; + float w_stride = inTexture.get_width() / outTexture.get_width(); + float h_stride = inTexture.get_height() / outTexture.get_height(); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 input = inTexture.sample(sample, float2(gid.x * w_stride, gid.y * h_stride), 0); + outTexture.write(half4(input), gid); +} diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/PreProcessKernel.metal b/metal/paddle-mobile-demo/paddle-mobile-demo/PreProcessKernel.metal deleted file mode 100644 index f359ab39ac5fbc18febfb6f0da367e72b61b959c..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/PreProcessKernel.metal +++ /dev/null @@ -1,44 +0,0 @@ -// -// PreProcessKernel.metal -// paddle-mobile-demo -// -// Created by liuRuiLong on 2018/7/20. -// Copyright © 2018年 orange. All rights reserved. 
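
> Editor note: the preprocessing kernels in the new Net/PreProcessKernel.metal all apply the same per-texel transform — scale the sampled value back to 0–255, subtract the per-channel mean, multiply by a scale factor, and write the channels back in BGR order (the mobilenet variants use means 123.68/116.78/103.94, the genet and AR variants use 128, all with scale 0.017). A small CPU-side Swift reference of the genet variant follows, useful for sanity-checking kernel output; it is a sketch, not part of the framework.

```swift
// CPU reference for the genet_preprocess kernel above (sketch only):
// out = (in * 255 - mean) * scale, with the channels written back in BGR order.
func genetPreprocess(r: Float, g: Float, b: Float) -> (Float, Float, Float) {
    let mean: Float = 128.0
    let scale: Float = 0.017
    let nr = (r * 255.0 - mean) * scale
    let ng = (g * 255.0 - mean) * scale
    let nb = (b * 255.0 - mean) * scale
    return (nb, ng, nr)   // kernel writes float4(inColor.z, inColor.y, inColor.x, 0)
}

// Example: a mid-grey texel (0.5, 0.5, 0.5) maps to (-0.0085, -0.0085, -0.0085).
```
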
-// - -#include -using namespace metal; - - -kernel void preprocess( - texture2d inTexture [[texture(0)]], - texture2d outTexture [[texture(1)]], - uint2 gid [[thread_position_in_grid]]) -{ - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) { - return; - } - const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f); - const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; - outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); -} - -kernel void preprocess_half( - texture2d inTexture [[texture(0)]], - texture2d outTexture [[texture(1)]], - uint2 gid [[thread_position_in_grid]]) -{ - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) { - return; - } - const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f); - const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; - outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); -} - - - - - diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift new file mode 100644 index 0000000000000000000000000000000000000000..f9e841f9c2a3060e775726023b6d5cfc3eeb679d --- /dev/null +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift @@ -0,0 +1,31 @@ + + +import Foundation +import QuartzCore + +public class FPSCounter { + private(set) public var fps: Double = 0 + + var frames = 0 + var startTime: CFTimeInterval = 0 + + public func start() { + frames = 0 + startTime = CACurrentMediaTime() + } + + public func frameCompleted() { + frames += 1 + let now = CACurrentMediaTime() + let elapsed = now - startTime + if elapsed > 0.1 { + let current = Double(frames) / elapsed + let smoothing = 0.75 + fps = smoothing*fps + (1 - smoothing)*current + if elapsed > 1 { + frames = 0 + startTime = CACurrentMediaTime() + } + } + } +} diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift new file mode 100644 index 0000000000000000000000000000000000000000..c235ed2f0391bdc97e9e182c0e9897814a0518fa --- /dev/null +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift @@ -0,0 +1,218 @@ + +import UIKit +import Metal +import CoreVideo +import AVFoundation + +@available(iOS 10.0, *) +@objc public protocol VideoCaptureDelegate: NSObjectProtocol { + @objc optional func videoCapture(_ capture: VideoCapture, didCaptureSampleBuffer sampleBuffer: CMSampleBuffer, timestamp: CMTime) + @objc optional func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime) + @objc optional func videoCapture(_ capture: VideoCapture, didCapturePhoto previewImage: UIImage?) + @objc optional func videoCapture(_ capture: VideoCapture, didCapturePhotoTexture texture: MTLTexture?) +} + +/** + Simple interface to the iPhone's camera. +*/ +@available(iOS 10.0, *) +public class VideoCapture: NSObject { + public var previewLayer: AVCaptureVideoPreviewLayer? + public weak var delegate: VideoCaptureDelegate? + public var fps = -1 + private let device: MTLDevice? + private let videoOrientation: AVCaptureVideoOrientation + private var textureCache: CVMetalTextureCache? 
+ private let captureSession = AVCaptureSession() + private let videoOutput = AVCaptureVideoDataOutput() + private let photoOutput = AVCapturePhotoOutput() + private let queue = DispatchQueue(label: "net.machinethink.camera-queue") + private var lastTimestamp = CMTime() + private let cameraPosition: AVCaptureDevice.Position + public init(device: MTLDevice? = nil, orientation: AVCaptureVideoOrientation = .portrait, position: AVCaptureDevice.Position = .back) { + self.device = device + self.videoOrientation = orientation + self.cameraPosition = position + super.init() + } + + public func setUp(sessionPreset: AVCaptureSession.Preset = .medium, + completion: @escaping (Bool) -> Void) { + queue.async { + let success = self.setUpCamera(sessionPreset: sessionPreset) + DispatchQueue.main.async { + completion(success) + } + } + } + + func fontCamera() -> AVCaptureDevice? { + let deveices = AVCaptureDevice.DiscoverySession.init(deviceTypes: [.builtInWideAngleCamera], mediaType: AVMediaType.video, position: .front).devices + return deveices.first + + } + + func setUpCamera(sessionPreset: AVCaptureSession.Preset) -> Bool { + if let inDevice = device{ + guard CVMetalTextureCacheCreate(kCFAllocatorDefault, nil, inDevice, nil, &textureCache) == kCVReturnSuccess else { + print("Error: could not create a texture cache") + return false + } + } + + captureSession.beginConfiguration() + captureSession.sessionPreset = sessionPreset + + var oCaptureDevice: AVCaptureDevice? + switch cameraPosition { + case .back: + oCaptureDevice = AVCaptureDevice.default(for: AVMediaType.video) + break + case .front: + oCaptureDevice = fontCamera() + break + default: + break + } + + guard let captureDevice = oCaptureDevice else { + print("Error: no video devices available") + return false + } + + guard let videoInput = try? AVCaptureDeviceInput(device: captureDevice) else { + print("Error: could not create AVCaptureDeviceInput") + return false + } + + if captureSession.canAddInput(videoInput) { + captureSession.addInput(videoInput) + } + + let previewLayer = AVCaptureVideoPreviewLayer(session: captureSession) + previewLayer.videoGravity = AVLayerVideoGravity.resizeAspect + previewLayer.connection?.videoOrientation = self.videoOrientation + self.previewLayer = previewLayer + + let settings: [String : Any] = [ + kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA) + ] + + videoOutput.videoSettings = settings + videoOutput.alwaysDiscardsLateVideoFrames = true + videoOutput.setSampleBufferDelegate(self, queue: queue) + if captureSession.canAddOutput(videoOutput) { + captureSession.addOutput(videoOutput) + } + + // We want the buffers to be in portrait orientation otherwise they are + // rotated by 90 degrees. Need to set this _after_ addOutput()! + videoOutput.connection(with: AVMediaType.video)?.videoOrientation = self.videoOrientation + + if captureSession.canAddOutput(photoOutput) { + captureSession.addOutput(photoOutput) + } + + captureSession.commitConfiguration() + return true + } + + public func start() { + if !captureSession.isRunning { + captureSession.startRunning() + } + } + + public func stop() { + if captureSession.isRunning { + captureSession.stopRunning() + } + } + + /* Captures a single frame of the camera input. 
*/ + public func capturePhoto() { + let settings = AVCapturePhotoSettings(format: [kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA)]) + settings.previewPhotoFormat = [ + kCVPixelBufferPixelFormatTypeKey as String: settings.__availablePreviewPhotoPixelFormatTypes[0], + kCVPixelBufferWidthKey as String: 480, + kCVPixelBufferHeightKey as String: 360, + ] + photoOutput.capturePhoto(with: settings, delegate: self) + } + + func convertToMTLTexture(sampleBuffer: CMSampleBuffer?) -> MTLTexture? { + if let textureCache = textureCache, let sampleBuffer = sampleBuffer, let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) { + let width = CVPixelBufferGetWidth(imageBuffer) + let height = CVPixelBufferGetHeight(imageBuffer) + var texture: CVMetalTexture? + CVMetalTextureCacheCreateTextureFromImage(kCFAllocatorDefault, textureCache, imageBuffer, nil, .bgra8Unorm, width, height, 0, &texture) + if let texture = texture { + return CVMetalTextureGetTexture(texture) + } + } + return nil + } + + func convertToUIImage(sampleBuffer: CMSampleBuffer?) -> UIImage? { + if let sampleBuffer = sampleBuffer, + let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) { + let width = CVPixelBufferGetWidth(imageBuffer) + let height = CVPixelBufferGetHeight(imageBuffer) + let rect = CGRect(x: 0, y: 0, width: CGFloat(width), height: CGFloat(height)) + let ciImage = CIImage(cvPixelBuffer: imageBuffer) + let ciContext = CIContext(options: nil) + if let cgImage = ciContext.createCGImage(ciImage, from: rect) { + return UIImage(cgImage: cgImage) + } + } + return nil + } +} + + +@available(iOS 10.0, *) +extension VideoCapture: AVCaptureVideoDataOutputSampleBufferDelegate { + public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + // Because lowering the capture device's FPS looks ugly in the preview, + // we capture at full speed but only call the delegate at its desired + // framerate. If `fps` is -1, we run at the full framerate. + let timestamp = CMSampleBufferGetPresentationTimeStamp(sampleBuffer) + let deltaTime = timestamp - lastTimestamp + if fps == -1 || deltaTime >= CMTimeMake(1, Int32(fps)) { + lastTimestamp = timestamp + self.delegate?.videoCapture?(self, didCaptureSampleBuffer: sampleBuffer, timestamp: timestamp) + if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCaptureVideoTexture:timestamp:))) ?? false{ + let texture = convertToMTLTexture(sampleBuffer: sampleBuffer) + delegate?.videoCapture?(self, didCaptureVideoTexture: texture, timestamp: timestamp) + } + } + } + + public func captureOutput(_ output: AVCaptureOutput, didDrop sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + print("dropped frame") + } +} + +@available(iOS 10.0, *) +extension VideoCapture: AVCapturePhotoCaptureDelegate { + public func photoOutput(_ captureOutput: AVCapturePhotoOutput, + didFinishProcessingPhoto photoSampleBuffer: CMSampleBuffer?, + previewPhoto previewPhotoSampleBuffer: CMSampleBuffer?, + resolvedSettings: AVCaptureResolvedPhotoSettings, + bracketSettings: AVCaptureBracketedStillImageSettings?, + error: Error?) { + var imageTexture: MTLTexture? + var previewImage: UIImage? + if error == nil { + if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhotoTexture:))) ?? 
false{ + imageTexture = convertToMTLTexture(sampleBuffer: photoSampleBuffer) + self.delegate?.videoCapture?(self, didCapturePhotoTexture: imageTexture) + } + + if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhoto:))) ?? false{ + previewImage = convertToUIImage(sampleBuffer: previewPhotoSampleBuffer) + self.delegate?.videoCapture?(self, didCapturePhoto: previewImage) + } + } + } +} diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift index 30fdaf078556bdc4546aec4f27e153f469d9e5ac..1c6d0a91c9bf1d202091282e43859270a238edaa 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift @@ -14,164 +14,292 @@ import UIKit import MetalKit +import CoreMedia import paddle_mobile import MetalPerformanceShaders -let threadSupport = [1] +var platform: Platform = .GPU +let threadSupport: [(Platform, String)] = [(.GPU, "GPU"), (.CPU, "CPU")] + +//.mobilenet_ssd : Runner.init(inNet: MobileNet_ssd_hand.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform), +let modelHelperMap: [SupportModel : Runner] = [ + .genet : Runner.init(inNet: Genet.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform), + .mobilenet_ssd_ar : Runner.init(inNet: MobileNet_ssd_AR.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform)] +//, .genet : Genet.init() +//let modelHelperMap: [SupportModel : Net] = [.mobilenet : MobileNet.init(), .mobilenet_ssd : MobileNet_ssd_hand.init()] + +let netSupport: [SupportModel : Net] = [.genet : Genet.init(device: MetalHelper.shared.device), .mobilenet_ssd_ar : MobileNet_ssd_AR.init(device: MetalHelper.shared.device)] + +enum SupportModel: String{ + // case mobilenet = "mobilenet" +// case mobilenet_ssd = "mobilenetssd" + case genet = "genet" + case mobilenet_ssd_ar = "mobilenetssd_ar" + + static func supportedModels() -> [SupportModel] { + // .mobilenet, + // .mobilenet_ssd, + return [.genet, .mobilenet_ssd_ar] + } +} class ViewController: UIViewController { - @IBOutlet weak var resultTextView: UITextView! - @IBOutlet weak var selectImageView: UIImageView! - @IBOutlet weak var elapsedTimeLabel: UILabel! - @IBOutlet weak var modelPickerView: UIPickerView! - @IBOutlet weak var threadPickerView: UIPickerView! - var selectImage: UIImage? - var program: Program? - var executor: Executor? - var modelType: SupportModel = .mobilenet - var toPredictTexture: MTLTexture? - var modelHelper: ModelHelper { - return modelHelperMap[modelType] ?! " has no this type " - } - var threadNum = 1 + @IBOutlet weak var resultTextView: UITextView! + @IBOutlet weak var selectImageView: UIImageView! + @IBOutlet weak var elapsedTimeLabel: UILabel! + @IBOutlet weak var modelPickerView: UIPickerView! + @IBOutlet weak var threadPickerView: UIPickerView! + @IBOutlet weak var videoView: UIView! +// var videoCapture: VideoCapture! + + var selectImage: UIImage? + var inputPointer: UnsafeMutablePointer? + var modelType: SupportModel = SupportModel.supportedModels()[0] + var toPredictTexture: MTLTexture? + + var runner: Runner! 
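
> Editor note: VideoCapture and FPSCounter above give the demo a small camera pipeline — configure a capture session, receive frames as Metal textures through VideoCaptureDelegate, and track a smoothed frame rate. The sketch below shows how a view controller might wire them together, based only on the API visible in this diff; the view-controller name and wiring are assumptions.

```swift
import UIKit
import Metal
import CoreMedia
import AVFoundation

// Illustrative wiring of the VideoCapture / FPSCounter classes added in this change.
@available(iOS 10.0, *)
class CameraDemoViewController: UIViewController, VideoCaptureDelegate {
    let device = MTLCreateSystemDefaultDevice()!
    let fpsCounter = FPSCounter()
    var capture: VideoCapture!

    override func viewDidLoad() {
        super.viewDidLoad()
        capture = VideoCapture(device: device, orientation: .portrait, position: .back)
        capture.delegate = self
        capture.fps = 30                      // deliver at most ~30 textures per second
        capture.setUp { [weak self] success in
            guard let sSelf = self, success else { return }
            if let layer = sSelf.capture.previewLayer {
                layer.frame = sSelf.view.bounds
                sSelf.view.layer.addSublayer(layer)
            }
            sSelf.fpsCounter.start()
            sSelf.capture.start()
        }
    }

    // Called on the capture queue with a Metal texture for every delivered frame.
    func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime) {
        fpsCounter.frameCompleted()
        // `texture` could be handed to Runner.predict(texture:completion:) here.
        print("camera fps ~ \(fpsCounter.fps)")
    }
}
```
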
+ + var threadNum = 1 + + @IBAction func loadAct(_ sender: Any) { + runner = Runner.init(inNet: netSupport[modelType]!, commandQueue: MetalHelper.shared.queue, inPlatform: platform) - @IBAction func loadAct(_ sender: Any) { - let inModelHelper = modelHelper - let queue = MetalHelper.shared.queue - let loader = Loader.init() - do { - let modelPath = inModelHelper.modelPath - let paraPath = inModelHelper.paramPath - - program = try loader.load(device: MetalHelper.shared.device, modelPath: modelPath, paraPath: paraPath) - executor = try Executor.init(inDevice: MetalHelper.shared.device, inQueue: queue, inProgram: program!) - } catch let error { - print(error) + if platform == .CPU { + if inputPointer == nil { + inputPointer = runner.preproccess(image: selectImage!.cgImage!) + + } + } else if platform == .GPU { + if self.toPredictTexture == nil { + runner.getTexture(image: selectImage!.cgImage!) {[weak self] (texture) in + self?.toPredictTexture = texture } + } + } else { + fatalError( " unsupport " ) } - @IBAction func selectImageAct(_ sender: Any) { - let imagePicker = UIImagePickerController() - imagePicker.sourceType = .camera - imagePicker.delegate = self - self.present(imagePicker, animated: true, completion: nil) - } - - @IBAction func clearAct(_ sender: Any) { - executor?.clear() - program = nil - executor = nil - + if runner.load() { + print(" load success ! ") + } else { + print(" load error ! ") } - - @IBAction func predictAct(_ sender: Any) { - guard let inTexture = toPredictTexture else { - resultTextView.text = "请选择图片 ! " - return + } + + @IBAction func selectImageAct(_ sender: Any) { + let imagePicker = UIImagePickerController() + imagePicker.sourceType = .camera + imagePicker.delegate = self + self.present(imagePicker, animated: true, completion: nil) + } + + @IBAction func clearAct(_ sender: Any) { + runner.clear() + } + + @IBAction func predictAct(_ sender: Any) { + let max = 50 + switch platform { + case .GPU: + guard let inTexture = toPredictTexture else { + resultTextView.text = "请选择图片 ! " + return + } + + for _ in 0..<10{ + runner.predict(texture: inTexture) { (success, resultHolder) in + resultHolder?.releasePointer() + } + } + + let startDate = Date.init() + for i in 0.. Int { - if pickerView == modelPickerView { - return 1 - } else if pickerView == threadPickerView { - return 1 - } else { - fatalError() - } + func numberOfComponents(in pickerView: UIPickerView) -> Int { + if pickerView == modelPickerView { + return 1 + } else if pickerView == threadPickerView { + return 1 + } else { + fatalError() } - - func pickerView(_ pickerView: UIPickerView, numberOfRowsInComponent component: Int) -> Int { - if pickerView == modelPickerView { - return SupportModel.supportedModels().count - } else if pickerView == threadPickerView { - return threadSupport.count - } else { - fatalError() - } + } + + func pickerView(_ pickerView: UIPickerView, numberOfRowsInComponent component: Int) -> Int { + if pickerView == modelPickerView { + return SupportModel.supportedModels().count + } else if pickerView == threadPickerView { + return threadSupport.count + } else { + fatalError() } - - public func pickerView(_ pickerView: UIPickerView, titleForRow row: Int, forComponent component: Int) -> String? 
{ - if pickerView == modelPickerView { - return SupportModel.supportedModels()[row].rawValue - } else if pickerView == threadPickerView { - return "\(threadSupport[row])" - } else { - fatalError() - } + } + + public func pickerView(_ pickerView: UIPickerView, titleForRow row: Int, forComponent component: Int) -> String? { + if pickerView == modelPickerView { + return SupportModel.supportedModels()[row].rawValue + } else if pickerView == threadPickerView { + return threadSupport[row].1 + } else { + fatalError() } - - public func pickerView(_ pickerView: UIPickerView, didSelectRow row: Int, inComponent component: Int) { - if pickerView == modelPickerView { - self.modelType = SupportModel.supportedModels()[row] - } else if pickerView == threadPickerView { - self.threadNum = threadSupport[row] - } else { - fatalError() - } + } + + public func pickerView(_ pickerView: UIPickerView, didSelectRow row: Int, inComponent component: Int) { + if pickerView == modelPickerView { + self.modelType = SupportModel.supportedModels()[row] + } else if pickerView == threadPickerView { + + platform = threadSupport[row].0 + } else { + fatalError() } + } } extension ViewController: UIImagePickerControllerDelegate, UINavigationControllerDelegate { - func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) { - picker.dismiss(animated: true){[weak self] in - guard let sSelf = self, let image = info["UIImagePickerControllerOriginalImage"] as? UIImage else{ - fatalError("no image") - } - sSelf.selectImage = image - sSelf.selectImageView.image = image - sSelf.modelHelper.getTexture(image: image.cgImage!, getTexture: { (texture) in - sSelf.toPredictTexture = texture - }) - } + func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) { + picker.dismiss(animated: true){[weak self] in + guard let sSelf = self, let image = info["UIImagePickerControllerOriginalImage"] as? UIImage else{ + fatalError("no image") + } + sSelf.selectImage = image + sSelf.selectImageView.image = image + sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in + sSelf.toPredictTexture = texture + }) } + } } +var bool1 = false +extension ViewController: VideoCaptureDelegate{ + func predictTexture(texture: MTLTexture){ + runner.scaleTexture(input: texture) { (scaledTexture) in + self.runner.predict(texture: scaledTexture, completion: { (success, resultHolder) in +// print(resultHolder!.result![0]) + resultHolder?.releasePointer() + }) + } + } + + +// @available(iOS 10.0, *) +// func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime) { +//// if !bool1 { +//// DispatchQueue.main.asyncAfter(deadline: DispatchTime.init(uptimeNanoseconds: 500000000)) { +// self.predictTexture(texture: texture!) +//// } +// +// +//// bool1 = true +//// } +// +// } + +} + + + diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/paddle-mobile-demo-Bridging-Header.h b/metal/paddle-mobile-demo/paddle-mobile-demo/paddle-mobile-demo-Bridging-Header.h new file mode 100644 index 0000000000000000000000000000000000000000..92de82860ccd372ba0eae962edd1b271986f1862 --- /dev/null +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/paddle-mobile-demo-Bridging-Header.h @@ -0,0 +1,5 @@ +// +// Use this file to import your target's public headers that you would like to expose to Swift. 
+// + +#import diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift index 6ab6f7c05e30049e850170409efcd6f049c73abe..7817befaedf1aff04b75abd39cc6f7f06bc935d3 100644 --- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift +++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift @@ -1,10 +1,16 @@ -// -// AppDelegate.swift -// paddle-mobile-unit-test -// -// Created by liuRuiLong on 2018/8/10. -// Copyright © 2018年 orange. All rights reserved. -// +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ import UIKit diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift index d57b610e4d10f02d2eace4892a6d55eda8f2c9b9..98f03affa2a230b2698edf6bafe5e06def8986b6 100644 --- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift +++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift @@ -1,18 +1,34 @@ -// -// ViewController.swift -// paddle-mobile-unit-test -// -// Created by liuRuiLong on 2018/8/10. -// Copyright © 2018年 orange. All rights reserved. -// +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ import UIKit +import Metal +//import MetalKit import paddle_mobile class ViewController: UIViewController { - override func viewDidLoad() { super.viewDidLoad() + let device = Metal.MTLCreateSystemDefaultDevice()! + let queue = device.makeCommandQueue()! 
+ let test = PaddleMobileUnitTest.init( + inDevice: device, + inQueue: queue + ) + test.testConcat() +// test.testReshape() +// test.testTranspose() print(" done ") } diff --git a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj index 6bceab43210c42ef83a2152463caf3bc8917b8c8..34d45528542d0d6a9d5ac153a7d6f818d962cbfd 100644 --- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj +++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj @@ -7,7 +7,31 @@ objects = { /* Begin PBXBuildFile section */ + 4AA1EA862146625E00D0F791 /* BilinearInterpOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA852146625E00D0F791 /* BilinearInterpOp.swift */; }; + 4AA1EA88214662BD00D0F791 /* BilinearInterpKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA87214662BD00D0F791 /* BilinearInterpKernel.swift */; }; + 4AA1EA8A2146631C00D0F791 /* BilinearInterp.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA892146631C00D0F791 /* BilinearInterp.metal */; }; + 4AA1EA8C2146640900D0F791 /* SplitOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA8B2146640900D0F791 /* SplitOp.swift */; }; + 4AA1EA8E2146647F00D0F791 /* SplitKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA8D2146647F00D0F791 /* SplitKernel.swift */; }; + 4AA1EA90214664CD00D0F791 /* Split.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA8F214664CD00D0F791 /* Split.metal */; }; + 4AA1EA92214665D700D0F791 /* ShapeOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA91214665D700D0F791 /* ShapeOp.swift */; }; + 4AA1EA942146661500D0F791 /* ShapeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA932146661500D0F791 /* ShapeKernel.swift */; }; + 4AA1EA982146666500D0F791 /* FlattenOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA972146666500D0F791 /* FlattenOp.swift */; }; + 4AA1EA9E2148D6F900D0F791 /* ConcatKernel.inc.metal in Headers */ = {isa = PBXBuildFile; fileRef = 4AA1EA9D2148D6F900D0F791 /* ConcatKernel.inc.metal */; }; + 4AA1EAA02148DEEE00D0F791 /* ReshapeKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA9F2148DEEE00D0F791 /* ReshapeKernel.inc.metal */; }; + 4AA1EAA2214912CD00D0F791 /* FlattenKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA1214912CC00D0F791 /* FlattenKernel.swift */; }; + 4AA1EAA4214A295C00D0F791 /* Split.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA3214A295C00D0F791 /* Split.inc.metal */; }; + 4AA1EAA6214B5F6800D0F791 /* Shape.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA5214B5F6800D0F791 /* Shape.metal */; }; + 4AA1EAA8214B7AFB00D0F791 /* BilinearInterp.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA7214B7AFB00D0F791 /* BilinearInterp.inc.metal */; }; + 4AA1EAAA214F53D800D0F791 /* BoxCoder.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA9214F53D800D0F791 /* BoxCoder.inc.metal */; }; + 4AA1EAAC214F55C800D0F791 /* Softmax.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAAB214F55C800D0F791 /* Softmax.inc.metal */; }; + 4AA1EAAE214F5FD900D0F791 /* TransposeKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAAD214F5FD900D0F791 /* TransposeKernel.inc.metal */; }; + 4AF928772133F1DB005B6C3A /* BoxCoder.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF928762133F1DB005B6C3A /* BoxCoder.metal */; }; + 4AF9287921341661005B6C3A /* Softmax.metal in Sources */ = {isa = PBXBuildFile; fileRef = 
4AF9287821341661005B6C3A /* Softmax.metal */; }; + 4AF928822135673D005B6C3A /* ConcatKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF928812135673D005B6C3A /* ConcatKernel.metal */; }; + 4AF9288421357BE3005B6C3A /* Elementwise.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF9288321357BE3005B6C3A /* Elementwise.metal */; }; D3831F70E7E0B565B9AC22DA /* Pods_paddle_mobile.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = DD2E06330A1E7129C918DB46 /* Pods_paddle_mobile.framework */; }; + FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC0226552138F33800F395E2 /* TransposeKernel.metal */; }; + FC0226582138F38D00F395E2 /* PoolKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC0226572138F38D00F395E2 /* PoolKernel.metal */; }; FC039B6F20E11C3C0081E9F8 /* paddle_mobile.h in Headers */ = {isa = PBXBuildFile; fileRef = FC039B6D20E11C3C0081E9F8 /* paddle_mobile.h */; settings = {ATTRIBUTES = (Public, ); }; }; FC039B9720E11C9A0081E9F8 /* Extensions.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B9420E11C9A0081E9F8 /* Extensions.swift */; }; FC039B9820E11C9A0081E9F8 /* Errors.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B9520E11C9A0081E9F8 /* Errors.swift */; }; @@ -35,17 +59,54 @@ FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */; }; FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */; }; FC1B16B320EC9A4F00678B91 /* Kernels.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC1B16B220EC9A4F00678B91 /* Kernels.metal */; }; - FC1B186620ECF1C600678B91 /* ResizeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC1B186520ECF1C600678B91 /* ResizeKernel.swift */; }; + FC292C5421421B2F00CF622F /* PaddleMobileGPU.h in Headers */ = {isa = PBXBuildFile; fileRef = FC292C5321421B2E00CF622F /* PaddleMobileGPU.h */; settings = {ATTRIBUTES = (Public, ); }; }; + FC292C5621421B4600CF622F /* PaddleMobileGPU.m in Sources */ = {isa = PBXBuildFile; fileRef = FC292C5521421B4600CF622F /* PaddleMobileGPU.m */; }; + FC292C81214255BD00CF622F /* CPUCompute.mm in Sources */ = {isa = PBXBuildFile; fileRef = FC292C7C214255BC00CF622F /* CPUCompute.mm */; }; + FC292C82214255BD00CF622F /* MobileNetSSD.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC292C7E214255BC00CF622F /* MobileNetSSD.swift */; }; + FC292C85214257CB00CF622F /* CPUCompute.h in Headers */ = {isa = PBXBuildFile; fileRef = FC292C7D214255BC00CF622F /* CPUCompute.h */; settings = {ATTRIBUTES = (Public, ); }; }; + FC292C872142624800CF622F /* Genet.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC292C862142624800CF622F /* Genet.swift */; }; + FC33B0F02147659000714A93 /* MobileNet.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC33B0EF2147659000714A93 /* MobileNet.swift */; }; FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */; }; FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74820F0B954007C0C6D /* ConvKernel.metal */; }; FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */; }; + FC4FD9752140E1DE0073E130 /* PaddleMobile.swift in Sources */ = {isa = PBXBuildFile; fileRef = 
FC4FD9742140E1DE0073E130 /* PaddleMobile.swift */; }; + FC4FD9792140E4980073E130 /* PaddleMobileCPU.h in Headers */ = {isa = PBXBuildFile; fileRef = FC4FD9772140E4980073E130 /* PaddleMobileCPU.h */; settings = {ATTRIBUTES = (Public, ); }; }; + FC4FD97A2140E4980073E130 /* libpaddle-mobile.a in Frameworks */ = {isa = PBXBuildFile; fileRef = FC4FD9782140E4980073E130 /* libpaddle-mobile.a */; }; + FC4FD97E2140F2C30073E130 /* libstdc++.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = FC4FD97D2140F2C30073E130 /* libstdc++.tbd */; }; FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC5163F520EF556E00636C28 /* Texture2DTo2DArrayKernel.swift */; }; FC60DB8920E9AAA500FF203F /* MetalExtension.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC60DB8820E9AAA500FF203F /* MetalExtension.swift */; }; + FC803BBF214CB65A0094B8E5 /* ConvAddPreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */; }; + FC803BC1214CB77A0094B8E5 /* ConvAddPreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC0214CB77A0094B8E5 /* ConvAddPreluKernel.swift */; }; + FC803BC3214CB79C0094B8E5 /* ConvAddPreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC2214CB79C0094B8E5 /* ConvAddPreluKernel.metal */; }; + FC803BC5214CB8F00094B8E5 /* ConvAddPrelu.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC4214CB8F00094B8E5 /* ConvAddPrelu.inc.metal */; }; + FC803BC7214CBA820094B8E5 /* Macro.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC6214CBA820094B8E5 /* Macro.metal */; }; + FC803BC9214CFC8D0094B8E5 /* FetchKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */; }; FC82735920E3C04200BE430A /* OpCreator.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC82735820E3C04200BE430A /* OpCreator.swift */; }; + FC9A19E32148C31300CD9CBF /* MobilenetSSD_AR.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9A19E22148C31300CD9CBF /* MobilenetSSD_AR.swift */; }; FC9D037920E229E4000F735A /* OpParam.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D037820E229E4000F735A /* OpParam.swift */; }; FC9D038020E22FBB000F735A /* FeedOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D037F20E22FBB000F735A /* FeedOp.swift */; }; FC9D038220E2312E000F735A /* FetchOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D038120E2312E000F735A /* FetchOp.swift */; }; FC9D038420E23B01000F735A /* Texture.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D038320E23B01000F735A /* Texture.swift */; }; + FCA3A1632132A4AC00084FE5 /* ReshapeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */; }; + FCA3A1652132A5EB00084FE5 /* Common.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA3A1642132A5EB00084FE5 /* Common.metal */; }; + FCA67B1721364EF000BD58AA /* ConvTransposeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */; }; + FCA67CD52138272900BD58AA /* ConvAddMetal.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67CD42138272900BD58AA /* ConvAddMetal.metal */; }; + FCA67CD7213827AC00BD58AA /* ConvAddBNReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67CD6213827AC00BD58AA /* ConvAddBNReluKernel.metal */; }; + FCA67CD92138287B00BD58AA /* ConvBNReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67CD82138287B00BD58AA /* ConvBNReluKernel.metal */; 
}; + FCBCCC572122F41300D94F7E /* DwConvBNReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */; }; + FCBCCC592122F42700D94F7E /* ConvBNReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */; }; + FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */; }; + FCBCCC5D2122F8A100D94F7E /* DepthwiseConvOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC5C2122F8A100D94F7E /* DepthwiseConvOp.swift */; }; + FCBCCC5F2122FB3B00D94F7E /* PriorBoxOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC5E2122FB3B00D94F7E /* PriorBoxOp.swift */; }; + FCBCCC612122FBDF00D94F7E /* PriorBoxKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC602122FBDF00D94F7E /* PriorBoxKernel.swift */; }; + FCBCCC632122FCC000D94F7E /* TransposeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC622122FCC000D94F7E /* TransposeKernel.swift */; }; + FCBCCC652122FCD700D94F7E /* TransposeOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC642122FCD700D94F7E /* TransposeOp.swift */; }; + FCBCCC67212306B000D94F7E /* ConcatOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC66212306B000D94F7E /* ConcatOp.swift */; }; + FCBCCC69212306D300D94F7E /* ConcatKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC68212306D300D94F7E /* ConcatKernel.swift */; }; + FCBCCC6B2123071700D94F7E /* BoxcoderOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC6A2123071700D94F7E /* BoxcoderOp.swift */; }; + FCBCCC6D2123073A00D94F7E /* BoxcoderKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC6C2123073A00D94F7E /* BoxcoderKernel.swift */; }; + FCBCCC6F2123097100D94F7E /* MulticlassNMSOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC6E2123097100D94F7E /* MulticlassNMSOp.swift */; }; + FCBCCC71212309A700D94F7E /* MulticlassNMSKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */; }; FCD04E6620F314C50007374F /* PoolOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6520F314C50007374F /* PoolOp.swift */; }; FCD04E6820F315020007374F /* PoolKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6720F315020007374F /* PoolKernel.swift */; }; FCD04E6A20F319EC0007374F /* SoftmaxOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6920F319EC0007374F /* SoftmaxOp.swift */; }; @@ -55,15 +116,55 @@ FCD04E7220F343420007374F /* ConvAddOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E7120F343420007374F /* ConvAddOp.swift */; }; FCD04E7420F3437E0007374F /* ConvAddKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E7320F3437E0007374F /* ConvAddKernel.swift */; }; FCDC0FEB21099A1D00DC9EFB /* Tools.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDC0FEA21099A1D00DC9EFB /* Tools.swift */; }; + FCDDC6C6212F9FB800E5EF74 /* PreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6C5212F9FB800E5EF74 /* PreluKernel.swift */; }; + FCDDC6C8212FA3CA00E5EF74 /* ConvTransposeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */; }; + FCDDC6CA212FDF6800E5EF74 /* BatchNormKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6C9212FDF6800E5EF74 /* BatchNormKernel.metal */; }; + FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */ = {isa = 
PBXBuildFile; fileRef = FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */; }; + FCDDC6CF212FE14700E5EF74 /* PriorBoxKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */; }; + FCDE8A33212A917900F4A8F6 /* ConvTransposeOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */; }; + FCE3A1A92153DE5100C37CDE /* ConvAddAddPreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1A82153DE5100C37CDE /* ConvAddAddPreluOp.swift */; }; + FCE3A1AB2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1AA2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift */; }; + FCE3A1AD2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1AC2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift */; }; + FCE3A1AF2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1AE2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift */; }; + FCE3A1B12153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1B02153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal */; }; + FCE3A1B32153E91900C37CDE /* ElementwiseAddPreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1B22153E91900C37CDE /* ElementwiseAddPreluKernel.metal */; }; + FCE9D7B7214F869000B520C3 /* Net.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE9D7B6214F869000B520C3 /* Net.swift */; }; + FCE9D7B9214FAA4800B520C3 /* NMSFetchResultKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */; }; + FCEB684A212F00DB00D2448E /* PreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCEB6849212F00DB00D2448E /* PreluKernel.metal */; }; + FCEB684C212F093800D2448E /* PreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEB684B212F093800D2448E /* PreluOp.swift */; }; FCEBC0F420F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEBC0F320F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift */; }; FCEBC0F620F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEBC0F520F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift */; }; FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCF2D73720E64E70007AC5F5 /* Kernel.swift */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ + 4AA1EA852146625E00D0F791 /* BilinearInterpOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = BilinearInterpOp.swift; sourceTree = ""; }; + 4AA1EA87214662BD00D0F791 /* BilinearInterpKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = BilinearInterpKernel.swift; sourceTree = ""; }; + 4AA1EA892146631C00D0F791 /* BilinearInterp.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BilinearInterp.metal; sourceTree = ""; }; + 4AA1EA8B2146640900D0F791 /* SplitOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = SplitOp.swift; sourceTree = ""; }; + 4AA1EA8D2146647F00D0F791 /* SplitKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = SplitKernel.swift; sourceTree = ""; }; + 4AA1EA8F214664CD00D0F791 /* Split.metal */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Split.metal; sourceTree = ""; }; + 4AA1EA91214665D700D0F791 /* ShapeOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ShapeOp.swift; sourceTree = ""; }; + 4AA1EA932146661500D0F791 /* ShapeKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ShapeKernel.swift; sourceTree = ""; }; + 4AA1EA972146666500D0F791 /* FlattenOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FlattenOp.swift; sourceTree = ""; }; + 4AA1EA9D2148D6F900D0F791 /* ConcatKernel.inc.metal */ = {isa = PBXFileReference; explicitFileType = sourcecode.metal; fileEncoding = 4; path = ConcatKernel.inc.metal; sourceTree = ""; }; + 4AA1EA9F2148DEEE00D0F791 /* ReshapeKernel.inc.metal */ = {isa = PBXFileReference; explicitFileType = sourcecode.metal; fileEncoding = 4; path = ReshapeKernel.inc.metal; sourceTree = ""; }; + 4AA1EAA1214912CC00D0F791 /* FlattenKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FlattenKernel.swift; sourceTree = ""; }; + 4AA1EAA3214A295C00D0F791 /* Split.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Split.inc.metal; sourceTree = ""; }; + 4AA1EAA5214B5F6800D0F791 /* Shape.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Shape.metal; sourceTree = ""; }; + 4AA1EAA7214B7AFB00D0F791 /* BilinearInterp.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BilinearInterp.inc.metal; sourceTree = ""; }; + 4AA1EAA9214F53D800D0F791 /* BoxCoder.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BoxCoder.inc.metal; sourceTree = ""; }; + 4AA1EAAB214F55C800D0F791 /* Softmax.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Softmax.inc.metal; sourceTree = ""; }; + 4AA1EAAD214F5FD900D0F791 /* TransposeKernel.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = TransposeKernel.inc.metal; sourceTree = ""; }; + 4AF928762133F1DB005B6C3A /* BoxCoder.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BoxCoder.metal; sourceTree = ""; }; + 4AF9287821341661005B6C3A /* Softmax.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Softmax.metal; sourceTree = ""; }; + 4AF928812135673D005B6C3A /* ConcatKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConcatKernel.metal; sourceTree = ""; }; + 4AF9288321357BE3005B6C3A /* Elementwise.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Elementwise.metal; sourceTree = ""; }; CDF58151D902A1CBAE56A0C2 /* Pods-paddle-mobile.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile.debug.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile/Pods-paddle-mobile.debug.xcconfig"; sourceTree = ""; }; DD2E06330A1E7129C918DB46 /* Pods_paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; }; E2A7957C92EDA5C3BEC0FFC2 /* 
Pods-paddle-mobile.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile.release.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile/Pods-paddle-mobile.release.xcconfig"; sourceTree = ""; }; + FC0226552138F33800F395E2 /* TransposeKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = TransposeKernel.metal; sourceTree = ""; }; + FC0226572138F38D00F395E2 /* PoolKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PoolKernel.metal; sourceTree = ""; }; FC039B6A20E11C3C0081E9F8 /* paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; }; FC039B6D20E11C3C0081E9F8 /* paddle_mobile.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = paddle_mobile.h; sourceTree = ""; }; FC039B6E20E11C3C0081E9F8 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; @@ -93,17 +194,54 @@ FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BatchNormKernel.swift; sourceTree = ""; }; FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddKernel.swift; sourceTree = ""; }; FC1B16B220EC9A4F00678B91 /* Kernels.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Kernels.metal; sourceTree = ""; }; - FC1B186520ECF1C600678B91 /* ResizeKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ResizeKernel.swift; sourceTree = ""; }; + FC292C5321421B2E00CF622F /* PaddleMobileGPU.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PaddleMobileGPU.h; sourceTree = ""; }; + FC292C5521421B4600CF622F /* PaddleMobileGPU.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = PaddleMobileGPU.m; sourceTree = ""; }; + FC292C7C214255BC00CF622F /* CPUCompute.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = CPUCompute.mm; sourceTree = ""; }; + FC292C7D214255BC00CF622F /* CPUCompute.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = CPUCompute.h; sourceTree = ""; }; + FC292C7E214255BC00CF622F /* MobileNetSSD.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MobileNetSSD.swift; sourceTree = ""; }; + FC292C862142624800CF622F /* Genet.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Genet.swift; sourceTree = ""; }; + FC33B0EF2147659000714A93 /* MobileNet.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MobileNet.swift; sourceTree = ""; }; FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PaddleMobileUnitTest.swift; sourceTree = ""; }; FC4CB74820F0B954007C0C6D /* ConvKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvKernel.metal; sourceTree = ""; }; FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ProgramOptimize.swift; sourceTree = ""; }; + FC4FD9742140E1DE0073E130 /* PaddleMobile.swift */ = {isa = PBXFileReference; fileEncoding 
= 4; lastKnownFileType = sourcecode.swift; path = PaddleMobile.swift; sourceTree = ""; }; + FC4FD9772140E4980073E130 /* PaddleMobileCPU.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PaddleMobileCPU.h; sourceTree = ""; }; + FC4FD9782140E4980073E130 /* libpaddle-mobile.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; path = "libpaddle-mobile.a"; sourceTree = ""; }; + FC4FD97D2140F2C30073E130 /* libstdc++.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = "libstdc++.tbd"; path = "usr/lib/libstdc++.tbd"; sourceTree = SDKROOT; }; FC5163F520EF556E00636C28 /* Texture2DTo2DArrayKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Texture2DTo2DArrayKernel.swift; sourceTree = ""; }; FC60DB8820E9AAA500FF203F /* MetalExtension.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MetalExtension.swift; sourceTree = ""; }; + FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddPreluOp.swift; sourceTree = ""; }; + FC803BC0214CB77A0094B8E5 /* ConvAddPreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddPreluKernel.swift; sourceTree = ""; }; + FC803BC2214CB79C0094B8E5 /* ConvAddPreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddPreluKernel.metal; sourceTree = ""; }; + FC803BC4214CB8F00094B8E5 /* ConvAddPrelu.inc.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddPrelu.inc.metal; sourceTree = ""; }; + FC803BC6214CBA820094B8E5 /* Macro.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Macro.metal; sourceTree = ""; }; + FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = FetchKernel.metal; sourceTree = ""; }; FC82735820E3C04200BE430A /* OpCreator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpCreator.swift; sourceTree = ""; }; + FC9A19E22148C31300CD9CBF /* MobilenetSSD_AR.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MobilenetSSD_AR.swift; sourceTree = ""; }; FC9D037820E229E4000F735A /* OpParam.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpParam.swift; sourceTree = ""; }; FC9D037F20E22FBB000F735A /* FeedOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FeedOp.swift; sourceTree = ""; }; FC9D038120E2312E000F735A /* FetchOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FetchOp.swift; sourceTree = ""; }; FC9D038320E23B01000F735A /* Texture.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Texture.swift; sourceTree = ""; }; + FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ReshapeKernel.metal; sourceTree = ""; }; + FCA3A1642132A5EB00084FE5 /* Common.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Common.metal; sourceTree = ""; }; + FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvTransposeKernel.metal; sourceTree = ""; }; + FCA67CD42138272900BD58AA /* ConvAddMetal.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = 
ConvAddMetal.metal; sourceTree = ""; }; + FCA67CD6213827AC00BD58AA /* ConvAddBNReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddBNReluKernel.metal; sourceTree = ""; }; + FCA67CD82138287B00BD58AA /* ConvBNReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvBNReluKernel.metal; sourceTree = ""; }; + FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DwConvBNReluOp.swift; sourceTree = ""; }; + FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvBNReluOp.swift; sourceTree = ""; }; + FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvBNReluKernel.swift; sourceTree = ""; }; + FCBCCC5C2122F8A100D94F7E /* DepthwiseConvOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DepthwiseConvOp.swift; sourceTree = ""; }; + FCBCCC5E2122FB3B00D94F7E /* PriorBoxOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PriorBoxOp.swift; sourceTree = ""; }; + FCBCCC602122FBDF00D94F7E /* PriorBoxKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PriorBoxKernel.swift; sourceTree = ""; }; + FCBCCC622122FCC000D94F7E /* TransposeKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TransposeKernel.swift; sourceTree = ""; }; + FCBCCC642122FCD700D94F7E /* TransposeOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TransposeOp.swift; sourceTree = ""; }; + FCBCCC66212306B000D94F7E /* ConcatOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConcatOp.swift; sourceTree = ""; }; + FCBCCC68212306D300D94F7E /* ConcatKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConcatKernel.swift; sourceTree = ""; }; + FCBCCC6A2123071700D94F7E /* BoxcoderOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BoxcoderOp.swift; sourceTree = ""; }; + FCBCCC6C2123073A00D94F7E /* BoxcoderKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BoxcoderKernel.swift; sourceTree = ""; }; + FCBCCC6E2123097100D94F7E /* MulticlassNMSOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MulticlassNMSOp.swift; sourceTree = ""; }; + FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MulticlassNMSKernel.swift; sourceTree = ""; }; FCD04E6520F314C50007374F /* PoolOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PoolOp.swift; sourceTree = ""; }; FCD04E6720F315020007374F /* PoolKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PoolKernel.swift; sourceTree = ""; }; FCD04E6920F319EC0007374F /* SoftmaxOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SoftmaxOp.swift; sourceTree = ""; }; @@ -113,9 +251,25 @@ FCD04E7120F343420007374F /* ConvAddOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddOp.swift; sourceTree = ""; }; FCD04E7320F3437E0007374F /* ConvAddKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddKernel.swift; sourceTree = ""; }; FCDC0FEA21099A1D00DC9EFB /* 
Tools.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Tools.swift; sourceTree = ""; }; + FCDDC6C5212F9FB800E5EF74 /* PreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PreluKernel.swift; sourceTree = ""; }; + FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvTransposeKernel.swift; sourceTree = ""; }; + FCDDC6C9212FDF6800E5EF74 /* BatchNormKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = BatchNormKernel.metal; sourceTree = ""; }; + FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ReluKernel.metal; sourceTree = ""; }; + FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PriorBoxKernel.metal; sourceTree = ""; }; + FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvTransposeOp.swift; sourceTree = ""; }; + FCE3A1A82153DE5100C37CDE /* ConvAddAddPreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddAddPreluOp.swift; sourceTree = ""; }; + FCE3A1AA2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddAddPreluKernel.swift; sourceTree = ""; }; + FCE3A1AC2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddPreluOp.swift; sourceTree = ""; }; + FCE3A1AE2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddPreluKernel.swift; sourceTree = ""; }; + FCE3A1B02153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ElementwiseAddPreluKernel.inc.metal; sourceTree = ""; }; + FCE3A1B22153E91900C37CDE /* ElementwiseAddPreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ElementwiseAddPreluKernel.metal; sourceTree = ""; }; + FCE9D7B6214F869000B520C3 /* Net.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Net.swift; sourceTree = ""; }; + FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = NMSFetchResultKernel.metal; sourceTree = ""; }; + FCEB6849212F00DB00D2448E /* PreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PreluKernel.metal; sourceTree = ""; }; + FCEB684B212F093800D2448E /* PreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PreluOp.swift; sourceTree = ""; }; FCEBC0F320F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; name = ConvAddBatchNormReluOp.swift; path = "paddle-mobile/Operators/ConvAddBatchNormReluOp.swift"; sourceTree = SOURCE_ROOT; }; FCEBC0F520F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddBatchNormReluKernel.swift; sourceTree = ""; }; - FCF2D73720E64E70007AC5F5 /* Kernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; name = Kernel.swift; path = "paddle-mobile/Operators/Kernels/Kernel.swift"; sourceTree = SOURCE_ROOT; }; + 
FCF2D73720E64E70007AC5F5 /* Kernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; name = Kernel.swift; path = "paddle-mobile/Operators/Kernels/Base/Kernel.swift"; sourceTree = SOURCE_ROOT; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -123,7 +277,9 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( + FC4FD97E2140F2C30073E130 /* libstdc++.tbd in Frameworks */, D3831F70E7E0B565B9AC22DA /* Pods_paddle_mobile.framework in Frameworks */, + FC4FD97A2140E4980073E130 /* libpaddle-mobile.a in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -133,6 +289,7 @@ 336CBE234BF5DE48658DE65F /* Frameworks */ = { isa = PBXGroup; children = ( + FC4FD97D2140F2C30073E130 /* libstdc++.tbd */, DD2E06330A1E7129C918DB46 /* Pods_paddle_mobile.framework */, ); name = Frameworks; @@ -168,10 +325,19 @@ FC039B6C20E11C3C0081E9F8 /* paddle-mobile */ = { isa = PBXGroup; children = ( + FCE9D7B6214F869000B520C3 /* Net.swift */, + FC9A19E22148C31300CD9CBF /* MobilenetSSD_AR.swift */, + FC33B0EF2147659000714A93 /* MobileNet.swift */, + FC292C862142624800CF622F /* Genet.swift */, + FC292C7E214255BC00CF622F /* MobileNetSSD.swift */, + FC292C7C214255BC00CF622F /* CPUCompute.mm */, + FC292C7D214255BC00CF622F /* CPUCompute.h */, + FC292C5521421B4600CF622F /* PaddleMobileGPU.m */, + FC292C5321421B2E00CF622F /* PaddleMobileGPU.h */, + FC4FD9762140E4920073E130 /* CPU */, + FC4FD9742140E1DE0073E130 /* PaddleMobile.swift */, FC039BAE20E11CC20081E9F8 /* Program */, FC039BA320E11CBC0081E9F8 /* Operators */, - FC039BA120E11CB70081E9F8 /* Loader.swift */, - FC039B9A20E11CA00081E9F8 /* Executor.swift */, FC039B9C20E11CB20081E9F8 /* framework */, FC039B9320E11C9A0081E9F8 /* Common */, FC039B6D20E11C3C0081E9F8 /* paddle_mobile.h */, @@ -196,6 +362,8 @@ FC039B9C20E11CB20081E9F8 /* framework */ = { isa = PBXGroup; children = ( + FC039BA120E11CB70081E9F8 /* Loader.swift */, + FC039B9A20E11CA00081E9F8 /* Executor.swift */, FC039B9D20E11CB20081E9F8 /* Tensor.swift */, FC039B9E20E11CB20081E9F8 /* Dim.swift */, FC9D038320E23B01000F735A /* Texture.swift */, @@ -219,6 +387,23 @@ FCD04E6920F319EC0007374F /* SoftmaxOp.swift */, FCD04E6D20F31B4B0007374F /* ReshapeOp.swift */, FCD04E7120F343420007374F /* ConvAddOp.swift */, + FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */, + FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */, + FCBCCC5C2122F8A100D94F7E /* DepthwiseConvOp.swift */, + FCBCCC5E2122FB3B00D94F7E /* PriorBoxOp.swift */, + FCBCCC642122FCD700D94F7E /* TransposeOp.swift */, + FCBCCC66212306B000D94F7E /* ConcatOp.swift */, + FCBCCC6A2123071700D94F7E /* BoxcoderOp.swift */, + 4AA1EA8B2146640900D0F791 /* SplitOp.swift */, + 4AA1EA91214665D700D0F791 /* ShapeOp.swift */, + 4AA1EA972146666500D0F791 /* FlattenOp.swift */, + 4AA1EA852146625E00D0F791 /* BilinearInterpOp.swift */, + FCBCCC6E2123097100D94F7E /* MulticlassNMSOp.swift */, + FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */, + FCEB684B212F093800D2448E /* PreluOp.swift */, + FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */, + FCE3A1A82153DE5100C37CDE /* ConvAddAddPreluOp.swift */, + FCE3A1AC2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift */, ); path = Operators; sourceTree = ""; @@ -243,24 +428,46 @@ FC086BA520E67E8500D85EF7 /* Kernels */ = { isa = PBXGroup; children = ( + FCDDC6CD212FE02100E5EF74 /* Base */, + FCEB6837212F00B100D2448E /* metal */, + FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */, FC0E2DBB20EE45FE009C1FAC /* ConvKernel.swift */, - FCF2D73720E64E70007AC5F5 /* 
Kernel.swift */, - FC1B16B220EC9A4F00678B91 /* Kernels.metal */, - FC1B186520ECF1C600678B91 /* ResizeKernel.swift */, FC0E2DB920EE3B8D009C1FAC /* ReluKernel.swift */, FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */, FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */, FC5163F520EF556E00636C28 /* Texture2DTo2DArrayKernel.swift */, - FC4CB74820F0B954007C0C6D /* ConvKernel.metal */, FCEBC0F520F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift */, FCD04E6720F315020007374F /* PoolKernel.swift */, FCD04E6B20F31A280007374F /* SoftmaxKernel.swift */, FCD04E6F20F31B720007374F /* ReshapeKernel.swift */, + 4AA1EAA1214912CC00D0F791 /* FlattenKernel.swift */, FCD04E7320F3437E0007374F /* ConvAddKernel.swift */, + FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */, + FCBCCC602122FBDF00D94F7E /* PriorBoxKernel.swift */, + FCBCCC622122FCC000D94F7E /* TransposeKernel.swift */, + FCBCCC68212306D300D94F7E /* ConcatKernel.swift */, + FCBCCC6C2123073A00D94F7E /* BoxcoderKernel.swift */, + 4AA1EA8D2146647F00D0F791 /* SplitKernel.swift */, + 4AA1EA932146661500D0F791 /* ShapeKernel.swift */, + 4AA1EA87214662BD00D0F791 /* BilinearInterpKernel.swift */, + FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */, + FCDDC6C5212F9FB800E5EF74 /* PreluKernel.swift */, + FC803BC0214CB77A0094B8E5 /* ConvAddPreluKernel.swift */, + FCE3A1AA2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift */, + FCE3A1AE2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift */, ); path = Kernels; sourceTree = ""; }; + FC4FD9762140E4920073E130 /* CPU */ = { + isa = PBXGroup; + children = ( + FC4FD9782140E4980073E130 /* libpaddle-mobile.a */, + FC4FD9772140E4980073E130 /* PaddleMobileCPU.h */, + ); + path = CPU; + sourceTree = ""; + }; FCD592FA20E248EC00252966 /* Base */ = { isa = PBXGroup; children = ( @@ -271,6 +478,56 @@ path = Base; sourceTree = ""; }; + FCDDC6CD212FE02100E5EF74 /* Base */ = { + isa = PBXGroup; + children = ( + FCF2D73720E64E70007AC5F5 /* Kernel.swift */, + ); + path = Base; + sourceTree = ""; + }; + FCEB6837212F00B100D2448E /* metal */ = { + isa = PBXGroup; + children = ( + 4AF928812135673D005B6C3A /* ConcatKernel.metal */, + 4AA1EA9D2148D6F900D0F791 /* ConcatKernel.inc.metal */, + 4AF9288321357BE3005B6C3A /* Elementwise.metal */, + FC1B16B220EC9A4F00678B91 /* Kernels.metal */, + FC4CB74820F0B954007C0C6D /* ConvKernel.metal */, + 4AF928762133F1DB005B6C3A /* BoxCoder.metal */, + 4AA1EAA9214F53D800D0F791 /* BoxCoder.inc.metal */, + 4AA1EAA5214B5F6800D0F791 /* Shape.metal */, + 4AA1EA8F214664CD00D0F791 /* Split.metal */, + 4AA1EAA3214A295C00D0F791 /* Split.inc.metal */, + 4AA1EA892146631C00D0F791 /* BilinearInterp.metal */, + 4AA1EAA7214B7AFB00D0F791 /* BilinearInterp.inc.metal */, + 4AF9287821341661005B6C3A /* Softmax.metal */, + 4AA1EAAB214F55C800D0F791 /* Softmax.inc.metal */, + FCEB6849212F00DB00D2448E /* PreluKernel.metal */, + FCDDC6C9212FDF6800E5EF74 /* BatchNormKernel.metal */, + FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */, + FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */, + FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */, + 4AA1EA9F2148DEEE00D0F791 /* ReshapeKernel.inc.metal */, + FCA3A1642132A5EB00084FE5 /* Common.metal */, + FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */, + FCA67CD42138272900BD58AA /* ConvAddMetal.metal */, + FCA67CD6213827AC00BD58AA /* ConvAddBNReluKernel.metal */, + FCA67CD82138287B00BD58AA /* ConvBNReluKernel.metal */, + FC0226552138F33800F395E2 /* TransposeKernel.metal */, + 4AA1EAAD214F5FD900D0F791 /* TransposeKernel.inc.metal */, + FC0226572138F38D00F395E2 /* 
PoolKernel.metal */, + FC803BC2214CB79C0094B8E5 /* ConvAddPreluKernel.metal */, + FC803BC4214CB8F00094B8E5 /* ConvAddPrelu.inc.metal */, + FC803BC6214CBA820094B8E5 /* Macro.metal */, + FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */, + FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */, + FCE3A1B02153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal */, + FCE3A1B22153E91900C37CDE /* ElementwiseAddPreluKernel.metal */, + ); + path = metal; + sourceTree = ""; + }; /* End PBXGroup section */ /* Begin PBXHeadersBuildPhase section */ @@ -278,6 +535,10 @@ isa = PBXHeadersBuildPhase; buildActionMask = 2147483647; files = ( + FC4FD9792140E4980073E130 /* PaddleMobileCPU.h in Headers */, + FC292C85214257CB00CF622F /* CPUCompute.h in Headers */, + FC292C5421421B2F00CF622F /* PaddleMobileGPU.h in Headers */, + 4AA1EA9E2148D6F900D0F791 /* ConcatKernel.inc.metal in Headers */, FC039B6F20E11C3C0081E9F8 /* paddle_mobile.h in Headers */, ); runOnlyForDeploymentPostprocessing = 0; @@ -315,6 +576,7 @@ TargetAttributes = { FC039B6920E11C3C0081E9F8 = { CreatedOnToolsVersion = 9.3.1; + LastSwiftMigration = 0940; }; }; }; @@ -372,53 +634,124 @@ buildActionMask = 2147483647; files = ( FC9D038020E22FBB000F735A /* FeedOp.swift in Sources */, + 4AA1EAAA214F53D800D0F791 /* BoxCoder.inc.metal in Sources */, FC039B9F20E11CB20081E9F8 /* Tensor.swift in Sources */, + FC803BC9214CFC8D0094B8E5 /* FetchKernel.metal in Sources */, + FCA67CD7213827AC00BD58AA /* ConvAddBNReluKernel.metal in Sources */, + 4AF9287921341661005B6C3A /* Softmax.metal in Sources */, + 4AA1EA942146661500D0F791 /* ShapeKernel.swift in Sources */, FC0E2DBC20EE45FE009C1FAC /* ConvKernel.swift in Sources */, FC039BAA20E11CBC0081E9F8 /* ElementwiseAddOp.swift in Sources */, + FCDE8A33212A917900F4A8F6 /* ConvTransposeOp.swift in Sources */, + FCBCCC6B2123071700D94F7E /* BoxcoderOp.swift in Sources */, + 4AA1EAAE214F5FD900D0F791 /* TransposeKernel.inc.metal in Sources */, + 4AA1EAA4214A295C00D0F791 /* Split.inc.metal in Sources */, + FC803BC7214CBA820094B8E5 /* Macro.metal in Sources */, FC039B9B20E11CA00081E9F8 /* Executor.swift in Sources */, + 4AF9288421357BE3005B6C3A /* Elementwise.metal in Sources */, FCD04E7020F31B720007374F /* ReshapeKernel.swift in Sources */, + FCE3A1B12153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal in Sources */, FCD04E7220F343420007374F /* ConvAddOp.swift in Sources */, FC039BBB20E11CC20081E9F8 /* ProgramDesc.swift in Sources */, + FCE3A1AB2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift in Sources */, FC9D037920E229E4000F735A /* OpParam.swift in Sources */, FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */, - FC1B186620ECF1C600678B91 /* ResizeKernel.swift in Sources */, FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */, + FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */, + FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */, + FCDDC6C6212F9FB800E5EF74 /* PreluKernel.swift in Sources */, + FCA67CD52138272900BD58AA /* ConvAddMetal.metal in Sources */, + FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */, + 4AA1EA8C2146640900D0F791 /* SplitOp.swift in Sources */, + FC292C81214255BD00CF622F /* CPUCompute.mm in Sources */, FCEBC0F420F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift in Sources */, + 4AA1EAAC214F55C800D0F791 /* Softmax.inc.metal in Sources */, FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */, + 4AF928772133F1DB005B6C3A /* BoxCoder.metal in Sources */, + FC803BBF214CB65A0094B8E5 /* ConvAddPreluOp.swift in Sources */, + 
FC33B0F02147659000714A93 /* MobileNet.swift in Sources */, + FCEB684C212F093800D2448E /* PreluOp.swift in Sources */, + 4AA1EAA8214B7AFB00D0F791 /* BilinearInterp.inc.metal in Sources */, + FCA67CD92138287B00BD58AA /* ConvBNReluKernel.metal in Sources */, FC60DB8920E9AAA500FF203F /* MetalExtension.swift in Sources */, FCEBC0F620F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift in Sources */, + 4AA1EA8A2146631C00D0F791 /* BilinearInterp.metal in Sources */, + FCDDC6CA212FDF6800E5EF74 /* BatchNormKernel.metal in Sources */, FC1B16B320EC9A4F00678B91 /* Kernels.metal in Sources */, FC039BBA20E11CC20081E9F8 /* TensorDesc.swift in Sources */, FC039BA020E11CB20081E9F8 /* Dim.swift in Sources */, FC039BB820E11CC20081E9F8 /* framework.pb.swift in Sources */, FC039B9920E11C9A0081E9F8 /* Types.swift in Sources */, FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */, + FCA3A1632132A4AC00084FE5 /* ReshapeKernel.metal in Sources */, + FC4FD9752140E1DE0073E130 /* PaddleMobile.swift in Sources */, + FCBCCC592122F42700D94F7E /* ConvBNReluOp.swift in Sources */, FC039BA920E11CBC0081E9F8 /* ConvOp.swift in Sources */, FC9D038420E23B01000F735A /* Texture.swift in Sources */, + FCE3A1B32153E91900C37CDE /* ElementwiseAddPreluKernel.metal in Sources */, + 4AA1EAA2214912CD00D0F791 /* FlattenKernel.swift in Sources */, + 4AA1EA982146666500D0F791 /* FlattenOp.swift in Sources */, + FCBCCC652122FCD700D94F7E /* TransposeOp.swift in Sources */, + 4AA1EAA6214B5F6800D0F791 /* Shape.metal in Sources */, FCD04E6E20F31B4B0007374F /* ReshapeOp.swift in Sources */, FC039B9820E11C9A0081E9F8 /* Errors.swift in Sources */, FC039BBF20E11CC20081E9F8 /* Attribute.swift in Sources */, + 4AA1EA8E2146647F00D0F791 /* SplitKernel.swift in Sources */, FCD04E7420F3437E0007374F /* ConvAddKernel.swift in Sources */, FC039BB920E11CC20081E9F8 /* Scope.swift in Sources */, + FC292C5621421B4600CF622F /* PaddleMobileGPU.m in Sources */, FCD04E6620F314C50007374F /* PoolOp.swift in Sources */, + FCE9D7B9214FAA4800B520C3 /* NMSFetchResultKernel.metal in Sources */, FC039BAC20E11CBC0081E9F8 /* BatchNormOp.swift in Sources */, + FCBCCC6F2123097100D94F7E /* MulticlassNMSOp.swift in Sources */, FC039BBC20E11CC20081E9F8 /* VarDesc.swift in Sources */, + FC292C872142624800CF622F /* Genet.swift in Sources */, + FC803BC5214CB8F00094B8E5 /* ConvAddPrelu.inc.metal in Sources */, + 4AF928822135673D005B6C3A /* ConcatKernel.metal in Sources */, + FCBCCC632122FCC000D94F7E /* TransposeKernel.swift in Sources */, + FCBCCC71212309A700D94F7E /* MulticlassNMSKernel.swift in Sources */, FCDC0FEB21099A1D00DC9EFB /* Tools.swift in Sources */, FC0E2DBA20EE3B8D009C1FAC /* ReluKernel.swift in Sources */, + 4AA1EA862146625E00D0F791 /* BilinearInterpOp.swift in Sources */, + FCBCCC6D2123073A00D94F7E /* BoxcoderKernel.swift in Sources */, + FCBCCC69212306D300D94F7E /* ConcatKernel.swift in Sources */, + FCDDC6C8212FA3CA00E5EF74 /* ConvTransposeKernel.swift in Sources */, FC82735920E3C04200BE430A /* OpCreator.swift in Sources */, + FCA3A1652132A5EB00084FE5 /* Common.metal in Sources */, + 4AA1EA92214665D700D0F791 /* ShapeOp.swift in Sources */, + FC803BC1214CB77A0094B8E5 /* ConvAddPreluKernel.swift in Sources */, + FCBCCC5D2122F8A100D94F7E /* DepthwiseConvOp.swift in Sources */, + FCE3A1AF2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift in Sources */, + FCE9D7B7214F869000B520C3 /* Net.swift in Sources */, FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */, FC039BAB20E11CBC0081E9F8 /* Operator.swift in Sources */, FCD04E6A20F319EC0007374F /* 
SoftmaxOp.swift in Sources */, + FC292C82214255BD00CF622F /* MobileNetSSD.swift in Sources */, + FCBCCC612122FBDF00D94F7E /* PriorBoxKernel.swift in Sources */, + FCBCCC5F2122FB3B00D94F7E /* PriorBoxOp.swift in Sources */, FC9D038220E2312E000F735A /* FetchOp.swift in Sources */, + FCA67B1721364EF000BD58AA /* ConvTransposeKernel.metal in Sources */, FC039BBD20E11CC20081E9F8 /* Program.swift in Sources */, FC039BA220E11CB70081E9F8 /* Loader.swift in Sources */, + FCBCCC67212306B000D94F7E /* ConcatOp.swift in Sources */, FCD04E6C20F31A280007374F /* SoftmaxKernel.swift in Sources */, + FCEB684A212F00DB00D2448E /* PreluKernel.metal in Sources */, + 4AA1EAA02148DEEE00D0F791 /* ReshapeKernel.inc.metal in Sources */, + FC9A19E32148C31300CD9CBF /* MobilenetSSD_AR.swift in Sources */, + FCDDC6CF212FE14700E5EF74 /* PriorBoxKernel.metal in Sources */, FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */, + FCE3A1A92153DE5100C37CDE /* ConvAddAddPreluOp.swift in Sources */, FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */, + FCE3A1AD2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift in Sources */, FC039BC020E11CC20081E9F8 /* BlockDesc.swift in Sources */, + FC803BC3214CB79C0094B8E5 /* ConvAddPreluKernel.metal in Sources */, + 4AA1EA90214664CD00D0F791 /* Split.metal in Sources */, FCD04E6820F315020007374F /* PoolKernel.swift in Sources */, + FC0226582138F38D00F395E2 /* PoolKernel.metal in Sources */, FC039BAD20E11CBC0081E9F8 /* ReluOp.swift in Sources */, + FCBCCC572122F41300D94F7E /* DwConvBNReluOp.swift in Sources */, FC039BBE20E11CC20081E9F8 /* OpDesc.swift in Sources */, + 4AA1EA88214662BD00D0F791 /* BilinearInterpKernel.swift in Sources */, FC039B9720E11C9A0081E9F8 /* Extensions.swift in Sources */, ); runOnlyForDeploymentPostprocessing = 0; @@ -550,6 +883,7 @@ isa = XCBuildConfiguration; baseConfigurationReference = CDF58151D902A1CBAE56A0C2 /* Pods-paddle-mobile.debug.xcconfig */; buildSettings = { + CLANG_ENABLE_MODULES = YES; CODE_SIGN_IDENTITY = ""; CODE_SIGN_STYLE = Automatic; DEFINES_MODULE = YES; @@ -557,6 +891,7 @@ DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_CURRENT_VERSION = 1; DYLIB_INSTALL_NAME_BASE = "@rpath"; + ENABLE_BITCODE = NO; INFOPLIST_FILE = "paddle-mobile/Info.plist"; INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks"; IPHONEOS_DEPLOYMENT_TARGET = 9.0; @@ -565,10 +900,16 @@ "@executable_path/Frameworks", "@loader_path/Frameworks", ); + LIBRARY_SEARCH_PATHS = ( + "$(inherited)", + "$(PROJECT_DIR)/paddle-mobile/CPU", + ); + MACH_O_TYPE = mh_dylib; MTL_LANGUAGE_REVISION = UseDeploymentTarget; PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile"; PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; SKIP_INSTALL = YES; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; SWIFT_VERSION = 4.0; TARGETED_DEVICE_FAMILY = "1,2"; }; @@ -578,6 +919,7 @@ isa = XCBuildConfiguration; baseConfigurationReference = E2A7957C92EDA5C3BEC0FFC2 /* Pods-paddle-mobile.release.xcconfig */; buildSettings = { + CLANG_ENABLE_MODULES = YES; CODE_SIGN_IDENTITY = ""; CODE_SIGN_STYLE = Automatic; DEFINES_MODULE = YES; @@ -585,6 +927,7 @@ DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_CURRENT_VERSION = 1; DYLIB_INSTALL_NAME_BASE = "@rpath"; + ENABLE_BITCODE = NO; INFOPLIST_FILE = "paddle-mobile/Info.plist"; INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks"; IPHONEOS_DEPLOYMENT_TARGET = 9.0; @@ -593,6 +936,11 @@ "@executable_path/Frameworks", "@loader_path/Frameworks", ); + LIBRARY_SEARCH_PATHS = ( + "$(inherited)", + "$(PROJECT_DIR)/paddle-mobile/CPU", + ); + MACH_O_TYPE = mh_dylib; MTL_LANGUAGE_REVISION = 
UseDeploymentTarget; PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile"; PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; diff --git a/src/ios_io/PaddleMobile.h b/metal/paddle-mobile/paddle-mobile/CPU/PaddleMobileCPU.h similarity index 55% rename from src/ios_io/PaddleMobile.h rename to metal/paddle-mobile/paddle-mobile/CPU/PaddleMobileCPU.h index 5854c5c3a4d4c899feb88822b2f7993860d1ed76..c68d81f328f4ce9a9bf16624f677b2996644c35c 100644 --- a/src/ios_io/PaddleMobile.h +++ b/metal/paddle-mobile/paddle-mobile/CPU/PaddleMobileCPU.h @@ -17,7 +17,17 @@ #import #import -@interface PaddleMobile : NSObject +@interface PaddleMobileCPUResult: NSObject + +@property (assign, nonatomic, readonly) float *output; + +@property (assign, nonatomic, readonly) int outputSize; + +-(void)releaseOutput; + +@end + +@interface PaddleMobileCPU : NSObject /* 创建对象 @@ -34,13 +44,36 @@ */ - (BOOL)load:(NSString *)modelAndWeightPath; +/* + * 从内存中加载模型 + * */ +- (BOOL)LoadCombinedMemory:(size_t)modelLen + andModelBuf:(const uint8_t *)modelBuf + andModelParamsLen:(size_t)combinedParamsLen + andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf; + +/* + * 对图像进行预处理, 需要外部开辟 output 内存, 外部释放 output 内存 + * */ +-(void)preprocess:(CGImageRef)image + output:(float *)output + means:(NSArray *)means + scale:(float)scale + dim:(NSArray *)dim; + +/* + * 预测预处理后的数据, 返回结果使用结束需要调用其 realseOutput 函数进行释放 + * */ +- (PaddleMobileCPUResult *)predictInput:(float *)input + dim:(NSArray *)dim; + /* 进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict */ - (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale; /* - 进行预测 + 进行预测, 默认 means 为 0, scale 为 1.0 */ - (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim; diff --git a/metal/paddle-mobile/paddle-mobile/CPUCompute.h b/metal/paddle-mobile/paddle-mobile/CPUCompute.h new file mode 100644 index 0000000000000000000000000000000000000000..ed12dd60df4ea06944fdf4ff9b635fc12a99120e --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/CPUCompute.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once + +#import + + +@interface CPUResult: NSObject +@property (assign, nonatomic) float *output; +@property (assign, nonatomic) int outputSize; +@end + +@interface NMSCompute: NSObject + +@property (assign, nonatomic) float scoreThredshold; + +@property (assign, nonatomic) int nmsTopK; + +@property (assign, nonatomic) int keepTopK; + +@property (assign, nonatomic) float nmsEta; + +@property (assign, nonatomic) float nmsThreshold; + +@property (assign, nonatomic) int background_label; + +@property (strong, nonatomic) NSArray *scoreDim; + +@property (strong, nonatomic) NSArray *bboxDim; + +-(CPUResult *)computeWithScore:(float *)score andBBoxs:(float *)bbox; + +@end diff --git a/metal/paddle-mobile/paddle-mobile/CPUCompute.mm b/metal/paddle-mobile/paddle-mobile/CPUCompute.mm new file mode 100644 index 0000000000000000000000000000000000000000..b97153765b46bb63d604d8845eee08d91283481d --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/CPUCompute.mm @@ -0,0 +1,322 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + + +#import "CPUCompute.h" + +#import +#import +#import +#import + + + + +struct NMSParam { + + float *score_data; + + float *box_data; + + float *output; + + int output_size; + + std::vector score_dim; + + std::vector box_dim; + + float scoreThredshold; + + int nmsTopK; + + int keepTopK; + + float nmsEta; + + float nmsThreshold; + + int background_label; +}; + + +constexpr int kOutputDim = 6; +constexpr int kBBoxSize = 4; + +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +static inline void GetMaxScoreIndex( + const std::vector& scores, const T threshold, int top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { + sorted_indices->resize(top_k); + } +} + +template +static inline T BBoxArea(const T* box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. 
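+ // Non-normalized boxes are assumed to be in inclusive pixel coordinates, so width and height each get +1.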
+ return (w + 1) * (h + 1); + } + } +} + +template +static inline T JaccardOverlap(const T* box1, const T* box2, + const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = inter_xmax - inter_xmin; + const T inter_h = inter_ymax - inter_ymin; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +static inline void NMSFast( + const T *bbox_data, + std::vector bbox_dim, + const T *score_data, + const T score_threshold, const T nms_threshold, + const T eta, const int top_k, + std::vector* selected_indices) { + // The total boxes for each instance. + int num_boxes = bbox_dim[0]; + // 4: [xmin ymin xmax ymax] + int box_size = bbox_dim[1]; + + std::vector scores_data(num_boxes); + std::copy_n(score_data, num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, true); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } +} + +template +void MultiClassNMS(const T *boxes_data, + const std::vector &box_dim, + const T *scores_data, + const std::vector &score_dim, + std::map>* indices, int* num_nmsed_out, + const int& background_label, const int& nms_top_k, + const int& keep_top_k, const T& nms_threshold, + const T& nms_eta, const T& score_threshold) { + + int64_t class_num = score_dim[0]; + int64_t predict_dim = score_dim[1]; + int num_det = 0; + for (int c = 0; c < class_num; ++c) { + if (c == background_label) continue; + const T *score_data = scores_data + c * predict_dim; + + /// [c] is key + NMSFast(boxes_data, box_dim, score_data, score_threshold, nms_threshold, nms_eta, + nms_top_k, &((*indices)[c])); + num_det += (*indices)[c].size(); + } + + *num_nmsed_out = num_det; + if (keep_top_k > -1 && num_det > keep_top_k) { + std::vector>> score_index_pairs; + for (const auto& it : *indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + // PADDLE_ENFORCE_LT(idx, predict_dim); + score_index_pairs.push_back(std::make_pair(sdata[idx], std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(keep_top_k); + + // Store the new indices. 
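+ // Rebuild the per-class index map from the kept (score, (label, idx)) pairs so only keep_top_k detections remain.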
+ std::map> new_indices; + for (size_t j = 0; j < score_index_pairs.size(); ++j) { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].push_back(idx); + } + new_indices.swap(*indices); + *num_nmsed_out = keep_top_k; + } +} + +template +void MultiClassOutput(const T *scores_data, + const std::vector &score_dim, + const T *bboxes_data, + T *outputs_data, + const std::map>& selected_indices) { + int predict_dim = score_dim[1]; + int count = 0; + for (const auto& it : selected_indices) { + /// one batch + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& indices = it.second; + for (size_t j = 0; j < indices.size(); ++j) { + int idx = indices[j]; + const T* bdata = bboxes_data + idx * kBBoxSize; + outputs_data[count * kOutputDim] = label; // label + outputs_data[count * kOutputDim + 1] = sdata[idx]; // score + // xmin, ymin, xmax, ymax + std::memcpy(outputs_data + count * kOutputDim + 2, bdata, 4 * sizeof(T)); + count++; + } + } +} + +void MultiClassNMSCompute(NMSParam *param) { + assert(param->score_dim[0] == 1); + assert(param->box_dim[0] == 1); + assert (param->score_dim.size() == 3); + assert(param->box_dim.size() == 3); + + float* outputs; + auto background_label = param->background_label; + auto nms_top_k = param->nmsTopK; + auto keep_top_k = param->keepTopK; + auto nms_threshold = param->nmsThreshold; + auto nms_eta = param->nmsEta; + auto score_threshold = param->scoreThredshold; + + std::vector score_dim_one_batch = {param->score_dim[1], param->score_dim[2]}; + std::vector box_dim_one_batch = {param->box_dim[1], param->box_dim[2]}; + + std::vector batch_starts = {0}; + + std::map> indices; + int num_nmsed_out = 0; + + MultiClassNMS(param->box_data, box_dim_one_batch, param->score_data, score_dim_one_batch, &indices, &num_nmsed_out, + background_label, nms_top_k, keep_top_k, nms_threshold, + nms_eta, score_threshold); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + + int output_size = 0; + int num_kept = batch_starts.back(); + if (num_kept == 0) { + outputs = new float[1]; + outputs[0] = -1; + output_size = 1; + } else { + outputs = new float[num_kept * kOutputDim]; + int64_t s = batch_starts[0]; + int64_t e = batch_starts[1]; + if (e > s) { + MultiClassOutput(param->score_data, score_dim_one_batch, param->box_data, outputs, indices); + } + output_size = num_kept * kOutputDim; + } + param->output = outputs; + param->output_size = output_size; +} + +@implementation CPUResult +@end + +@implementation NMSCompute + +-(CPUResult *)computeWithScore:(float *)score andBBoxs:(float *)bbox { + NMSParam param; + param.box_data = bbox; + param.score_data = score; + param.background_label = self.background_label; + param.scoreThredshold = self.scoreThredshold; + param.nmsTopK = self.nmsTopK; + param.keepTopK = self.keepTopK; + param.nmsEta = self.nmsEta; + param.nmsThreshold = self.nmsThreshold; + std::vector score_dim; + for (int i = 0; i < self.scoreDim.count; ++i) { + score_dim.push_back(self.scoreDim[i].intValue); + } + param.score_dim = score_dim; + + std::vector box_dim; + for (int i = 0; i < self.bboxDim.count; ++i) { + box_dim.push_back(self.bboxDim[i].intValue); + } + param.box_dim = box_dim; + MultiClassNMSCompute(¶m); + CPUResult *cr = [[CPUResult alloc] init]; + cr.output = param.output; + cr.outputSize = param.output_size; + return cr; +} + +@end + + diff --git a/metal/paddle-mobile/paddle-mobile/Common/Extensions.swift 
b/metal/paddle-mobile/paddle-mobile/Common/Extensions.swift index 62954ede17d493ae12aa104d13a75dbc062e98a0..4c38a1b7b42e21f88b3b1c8825c181bb83293a54 100644 --- a/metal/paddle-mobile/paddle-mobile/Common/Extensions.swift +++ b/metal/paddle-mobile/paddle-mobile/Common/Extensions.swift @@ -16,95 +16,110 @@ import Foundation // 自定义 ?! 如果 ?! 前的返回值为一个可选值, 则进行隐式解包, 如果有值则返回这个值, 如果为nil 则fatalError 传入的信息 precedencegroup ExecutedOrFatalError{ - associativity: left - higherThan: AssignmentPrecedence + associativity: left + higherThan: AssignmentPrecedence } infix operator ?!: ExecutedOrFatalError public func ?!(option: T?, excuteOrError: @autoclosure () -> String) -> T{ - if let inOpt = option { - return inOpt - }else{ - print(excuteOrError()) - fatalError(excuteOrError()) - } + if let inOpt = option { + return inOpt + }else{ + print(excuteOrError()) + fatalError(excuteOrError()) + } } //Lense struct Lense { - let from: (A) -> B - let to: (B, A) -> A + let from: (A) -> B + let to: (B, A) -> A } precedencegroup CombineLense{ - associativity: left - higherThan: AssignmentPrecedence + associativity: left + higherThan: AssignmentPrecedence } infix operator >>>: CombineLense func >>>(left: Lense, right: Lense) -> Lense { - return Lense.init(from: { (a) -> C in - left.from(right.from(a)) - }, to: { (c, a) -> A in - right.to( left.to(c, right.from(a)),a) - }) + return Lense.init(from: { (a) -> C in + left.from(right.from(a)) + }, to: { (c, a) -> A in + right.to( left.to(c, right.from(a)),a) + }) } protocol CIntIndex { - associatedtype T; - subscript(index: CInt) -> T { get set}; + associatedtype T; + subscript(index: CInt) -> T { get set}; } extension Array: CIntIndex{ - typealias T = Element - subscript(index: CInt) -> T { - get{ - guard Int64(Int.max) >= Int64(index) else{ - fatalError("cint index out of Int range") - } - return self[Int(index)] - } - set{ - guard Int64(Int.max) >= Int64(index) else{ - fatalError("cint index out of Int range") - } - self[Int(index)] = newValue - } - + typealias T = Element + subscript(index: CInt) -> T { + get{ + guard Int64(Int.max) >= Int64(index) else{ + fatalError("cint index out of Int range") + } + return self[Int(index)] + } + set{ + guard Int64(Int.max) >= Int64(index) else{ + fatalError("cint index out of Int range") + } + self[Int(index)] = newValue } + + } } extension Array where Element: AnyObject{ - mutating func remove(element: Element) { - if let index = index(where: { (node) -> Bool in - return unsafeBitCast(element, to: Int.self) == unsafeBitCast(node, to: Int.self) - }) { - remove(at: index) - } + mutating func remove(element: Element) { + if let index = index(where: { (node) -> Bool in + return unsafeBitCast(element, to: Int.self) == unsafeBitCast(node, to: Int.self) + }) { + remove(at: index) } - + } + } //MARK: Array extension extension Array where Element: Comparable{ - - /// 返回数组前 r 个元素, 并将元素处于原数组的位置作为元组的第一个元素返回 - /// - /// - Parameter r: 前 r 个元素 - /// - Returns: [(原有位置, 排好位置的元素)] - public func top(r: Int) -> [(Int, Element)] { - precondition(r <= self.count) - return Array<(Int, Element)>(zip(0.. $1.1 }.prefix(through: r - 1)) + + /// 返回数组前 r 个元素, 并将元素处于原数组的位置作为元组的第一个元素返回 + /// + /// - Parameter r: 前 r 个元素 + /// - Returns: [(原有位置, 排好位置的元素)] + public func top(r: Int) -> [(Int, Element)] { + precondition(r <= self.count) + return Array<(Int, Element)>(zip(0.. $1.1 }.prefix(through: r - 1)) + } +} + +extension Array { + public func strideArray(inCount: Int = 20) -> [(Int, Element)] { + if count < inCount { + return (0.. UnsafePointer? 
{ - return (self as NSString).utf8String - } + func cStr() -> UnsafePointer? { + return (self as NSString).utf8String + } } func address(o: T) -> String { - return String.init(format: "%018p", unsafeBitCast(o, to: Int.self)) + return String.init(format: "%018p", unsafeBitCast(o, to: Int.self)) } diff --git a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift index b750018260f64ae89f5b3aab5cc987eee9a11415..3be8c118613b3e9d6a9247fd731cc74392392d5b 100644 --- a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift +++ b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift @@ -18,263 +18,588 @@ fileprivate var defaultMetalLibrary: MTLLibrary? fileprivate var paddleMobileMetalLibrary: MTLLibrary? extension MTLDevice { - func defaultLibrary() -> MTLLibrary { - if defaultMetalLibrary == nil { - defaultMetalLibrary = makeDefaultLibrary() - } - if let inDefaultLib = defaultMetalLibrary { - return inDefaultLib - } else { - fatalError(" default metal libary is nil") - } + func defaultLibrary() -> MTLLibrary { + if defaultMetalLibrary == nil { + defaultMetalLibrary = makeDefaultLibrary() + } + if let inDefaultLib = defaultMetalLibrary { + return inDefaultLib + } else { + fatalError(" default metal libary is nil") + } + } + + func paddleMobileLibrary() -> MTLLibrary { + if paddleMobileMetalLibrary == nil { + guard let path = Bundle.init(for: Kernel.self).path(forResource: "default", ofType: "metallib") else { + fatalError("Counld't find paddle mobile library") + } + do { + paddleMobileMetalLibrary = try makeLibrary(filepath: path) + } catch _ { + fatalError("Counld't load paddle mobile library") + } } - func paddleMobileLibrary() -> MTLLibrary { - if paddleMobileMetalLibrary == nil { - guard let path = Bundle.init(for: Kernel.self).path(forResource: "default", ofType: "metallib") else { - fatalError("Counld't find paddle mobile library") - } - do { - paddleMobileMetalLibrary = try makeLibrary(filepath: path) - } catch _ { - fatalError("Counld't load paddle mobile library") - } - } - - if let inPaddleMobileLib = paddleMobileMetalLibrary { - return inPaddleMobileLib - } else { - fatalError("PaddleMobile metal libary is nil") - } + if let inPaddleMobileLib = paddleMobileMetalLibrary { + return inPaddleMobileLib + } else { + fatalError("PaddleMobile metal libary is nil") + } + } + + func pipeLine(funcName: String, inPaddleMobileLib: Bool = true) -> MTLComputePipelineState { + let useLib = inPaddleMobileLib ? paddleMobileLibrary() : defaultLibrary() + guard let function = useLib.makeFunction(name: funcName) else { + fatalError(" function " + funcName + " not found") + } + do { + let pipLine = try makeComputePipelineState(function: function) + return pipLine + } catch let error { + print(error) + fatalError("make pip line error occured : \(error)") } - func pipeLine(funcName: String, inPaddleMobileLib: Bool = true) -> MTLComputePipelineState { - let useLib = inPaddleMobileLib ? paddleMobileLibrary() : defaultLibrary() - guard let function = useLib.makeFunction(name: funcName) else { - fatalError(" function " + funcName + " not found") - } - do { - let pipLine = try makeComputePipelineState(function: function) - return pipLine - } catch _ { - fatalError("make pip line error occured") - } - + } + + func makeBuffer
<P>
(value: [P]) -> MTLBuffer { + let buffer = makeBuffer(length: value.count * MemoryLayout
<P>
.size, options: MTLResourceOptions.storageModeShared) + let contents = buffer?.contents().bindMemory(to: P.self, capacity: value.count * MemoryLayout
<P>
.size) + for i in 0..(texture: MTLTexture, cb: ([Int], P)->Void) -> Void { + let bpR = texture.width * 4 * MemoryLayout
<P>
.size + let bpI = texture.height * bpR + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: 1)) + for i in 0.. = UnsafeMutablePointer
<P>
.allocate(capacity: bpI) + texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i) + for tx in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { + var tdim: [Int] = [1, 1, 1, 1] + for i in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { + var tdim: [Int] = [1, 1, 1, 1] + for i in 0..(value: [P]) -> MTLBuffer { - let buffer = makeBuffer(length: value.count * MemoryLayout
<P>
.size, options: MTLResourceOptions.storageModeShared) - let contents = buffer?.contents().bindMemory(to: P.self, capacity: value.count * MemoryLayout
<P>
.size) - for i in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { + var tdim: [Int] = [1, 1, 1, 1] + for i in 0..(value: [P], textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture{ - - let textureDesc = MTLTextureDescriptor.init() - textureDesc.width = textureWidth - textureDesc.height = textureHeight - textureDesc.depth = 1 - textureDesc.usage = [.shaderRead, .shaderWrite] - textureDesc.pixelFormat = .rgba32Float - textureDesc.textureType = .type2DArray - textureDesc.storageMode = .shared - textureDesc.cpuCacheMode = .defaultCache - textureDesc.arrayLength = arrayLength - let texture = makeTexture(descriptor: textureDesc)! - - if arrayLength == 1 && value.count >= 4{ - let pointer: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: value.count * MemoryLayout
<P>
.size) - for i in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { + if dim.count == 3 { + return texture2tensor_3(texture: texture, dim: dim, transpose: transpose) + } else if dim.count == 2 { + return texture2tensor_2(texture: texture, dim: dim, transpose: transpose) + } else if dim.count == 1 { + return texture2tensor_1(texture: texture, dim: dim, transpose: transpose) + } + var tdim: [Int] = [1, 1, 1, 1] + for i in 0..(value: [P], dim: [Int], transpose: [Int] = [0, 1, 2, 3], inComputePrecision: ComputePrecision = .Float32) -> MTLTexture { + if value.count > 0 { + assert(value.count == dim.reduce(1) { $0 * $1 }) + } + + var tdim: [Int] = [1, 1, 1, 1] + for i in 0.. 0 { + var rcount: Int = (ndim[0] * ndim[3] + 3) / 4 + rcount = rcount * 4 * ndim[1] * ndim[2] + var nvalue: [Float32] = .init(repeating: 0.0, count: rcount) + + for i0 in 0...size - let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: texture.depth)) - texture.replace(region: region, mipmapLevel: 0, withBytes: pointer, bytesPerRow: bytesPerRow) - } else { - - - + } } - - return texture + } + + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: ndim[2], height: ndim[1], depth: 1)) + if inComputePrecision == .Float16 { + let xvalue: [UInt16] = .init(repeating: 0, count: rcount) + let pointer: UnsafeMutablePointer = UnsafeMutablePointer(mutating: nvalue) + let outputP: UnsafeMutablePointer = UnsafeMutablePointer(mutating: xvalue) + float32ToFloat16(input: pointer, output: outputP, count: rcount) + let bpR = ndim[2] * 4 * 2 + let bpI = ndim[1] * bpR + for i in 0.. = UnsafeMutablePointer(mutating: nvalue) + let bpR = ndim[2] * 4 * MemoryLayout
<Float32>
.size + let bpI = ndim[1] * bpR + for i in 0..(value: [P], textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture{ + + let textureDesc = MTLTextureDescriptor.init() + textureDesc.width = textureWidth + textureDesc.height = textureHeight + textureDesc.depth = 1 + textureDesc.usage = [.shaderRead, .shaderWrite] + textureDesc.pixelFormat = .rgba32Float + textureDesc.textureType = .type2DArray + textureDesc.storageMode = .shared + textureDesc.cpuCacheMode = .defaultCache + textureDesc.arrayLength = arrayLength + let texture = makeTexture(descriptor: textureDesc)! + + if value.count >= 4{ + let counts = arrayLength * 4 * textureWidth * textureHeight + let pointer: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: counts * MemoryLayout
<P>
.size) + for i in 0...size + let bytesPerImage = texture.height * bytesPerRow + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: texture.depth)) + for i in 0..(stridable: Bool = true) -> [(index: Int, value: P)] { - var arr: [P] = floatArray { (p: P) -> P in - return p; + + func stridableFloatArray
<P>
(stridable: Bool = true) -> [(index: Int, value: P)] { + var arr: [P] = floatArray { (p: P) -> P in + return p; + } + var result: [(index: Int, value: P)] = [] + if arr.count > 100 && stridable { + for j in stride(from: 0, to: arr.count , by: arr.count / 100){ + result.append((j, arr[j])) + } + } else { + for j in 0..(res: (P) -> T) -> [T] { + var fArr: [T] = [] + if textureType == .type2DArray { + for i in 0...size, alignment: MemoryLayout
<P>
.alignment) + let bytesPerRow = width * depth * 4 * MemoryLayout
<P>
.size + let bytesPerImage = width * height * depth * 4 * MemoryLayout
<P>
.size + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) + getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i) + let p = bytes.assumingMemoryBound(to: P.self) + + for j in 0.. 100 && stridable { - for j in stride(from: 0, to: arr.count , by: arr.count / 100){ - result.append((j, arr[j])) - } + bytes.deallocate() + } + } else if textureType == .type2D { + let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout
<P>
.size, alignment: MemoryLayout
<P>
.alignment) + let bytesPerRow = width * depth * 4 * MemoryLayout
<P>
.size + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) + getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0) + let p = bytes.assumingMemoryBound(to: P.self) + + for j in 0.. [Float32] { + if pixelFormat == .rgba32Float { + let float32Array = floatArray { (f: Float32) -> Float32 in + return f + } + return float32Array + } else if pixelFormat == .rgba16Float { + + var float16Array = floatArray { (f: Float16) -> Float16 in + return f + } + return float16To32(input: &float16Array, count: float16Array.count) + } else { + fatalError() + } + } + + func logDesc(header: String = "", stridable: Bool = true) -> T? { + print(header) + print("texture: \(self)") + // let res: [(index: Int, value: T)] = stridableFloatArray(stridable: stridable) + // print(res) + + if textureType == .type2DArray { + for i in 0...size, alignment: MemoryLayout.alignment) + let bytesPerRow = width * depth * 4 * MemoryLayout.size + let bytesPerImage = width * height * depth * 4 * MemoryLayout.size + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) + getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i) + let p = bytes.assumingMemoryBound(to: T.self) + str += "2d array count : \(width * height * depth * 4) \n" + if stridable && width * height * depth * 4 > 20 { + for j in stride(from: 0, to: width * height * depth * 4 , by: width * height * depth * 4 / 20){ + str += " index \(j): \(p[j])" + } } else { - for j in 0...size, alignment: MemoryLayout.alignment) + let bytesPerRow = width * depth * 4 * MemoryLayout.size + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) + getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0) + let p = bytes.assumingMemoryBound(to: T.self) + str += "2d count : \(width * width * 4) \n" + + if stridable { + for j in stride(from: 0, to: width * height * 4, by: width * height * 4 / 20){ + str += "index \(j): \(p[j]) " + } + } else { + for j in 0.. [Float32] { + var textureArray: [Float32] + if pixelFormat == .rgba32Float { + textureArray = floatArray { (i : Float32) -> Float32 in + return i + } + } else if pixelFormat == .rgba16Float { + + var textureFloat16Array = floatArray { (i : Float16) -> Float16 in + return i + } + textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count) + } else { + fatalError(" 目前还不支持其他类型 ") + } + + var output: [Float32] = [] + for s in 0.. [Float32] { +// print("origin dim: \(dim)") +// print("texture: ") +// print(self) + + var textureArray: [Float32] + if pixelFormat == .rgba32Float { + textureArray = floatArray { (i : Float32) -> Float32 in + return i + } + } else if pixelFormat == .rgba16Float { + var textureFloat16Array = floatArray { (i : Float16) -> Float16 in + return i + } + textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count) + } else { + fatalError(" 目前还不支持其他类型 ") } - func floatArray(res: (P) -> T) -> [T] { - var fArr: [T] = [] - if textureType == .type2DArray { - for i in 0...size, alignment: MemoryLayout
<P>
.alignment) - let bytesPerRow = width * depth * 4 * MemoryLayout
<P>
.size - let bytesPerImage = width * height * depth * 4 * MemoryLayout
<P>
.size - let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) - getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i) - let p = bytes.assumingMemoryBound(to: P.self) - - for j in 0.. dim.c { + for i in 0..<(4 - ((sliceIndex * 4 + 4) - dim.c)) { + let value = textureArray[sliceIndex * numOfASlice + h * dim.w * 4 + w * 4 + i] + output.append(value) } - } else if textureType == .type2D { - let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout
<P>
.size, alignment: MemoryLayout
<P>
.alignment) - let bytesPerRow = width * depth * 4 * MemoryLayout
<P>
.size - let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) - getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0) - let p = bytes.assumingMemoryBound(to: P.self) - - for j in 0..(header: String = "", stridable: Bool = true) -> T? { - print(header) - print("texture: \(self)") - let res: [(index: Int, value: T)] = stridableFloatArray(stridable: stridable) - print(res) + return output + } -// if textureType == .type2DArray { -// for i in 0...size, alignment: MemoryLayout.alignment) -// let bytesPerRow = width * depth * 4 * MemoryLayout.size -// let bytesPerImage = width * height * depth * 4 * MemoryLayout.size -// let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) -// getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i) -// let p = bytes.assumingMemoryBound(to: T.self) -// str += "2d array count : \(width * height * depth * 4) \n" -// if stridable && width * height * depth * 4 > 100 { -// for j in stride(from: 0, to: width * height * depth * 4 , by: width * height * depth * 4 / 100){ -// str += " index \(j): \(p[j])" -// } -// } else { -// for j in 0...size, alignment: MemoryLayout.alignment) -// let bytesPerRow = width * depth * 4 * MemoryLayout.size -// let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) -// getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0) -// let p = bytes.assumingMemoryBound(to: T.self) -// str += "2d count : \(width * width * 4) \n" -// -// if stridable { -// for j in stride(from: 0, to: width * height * 4, by: width * height * 4 / 100){ -// str += "index \(j): \(p[j]) " -// } -// } else { -// for j in 0..(header: String = "", stridable: Bool = true) -> T? { - print(header) - print("MTLBuffer: \(self) ") - var str = "" - if stridable && length/MemoryLayout.stride > 1000{ - for j in stride(from: 0, to: length, by: length/MemoryLayout.stride / 100){ - str += " \(contents().assumingMemoryBound(to: T.self)[j])" - } - } else { - for i in 0...size { - str += " \(contents().assumingMemoryBound(to: T.self)[i])" - } - } - print(str) - return nil + func logDesc(header: String = "", stridable: Bool = true) -> T? { + print(header) + print("MTLBuffer: \(self) ") + var str = "" + if stridable && length/MemoryLayout.stride > 1000{ + for j in stride(from: 0, to: length, by: length/MemoryLayout.stride / 100){ + str += " \(contents().assumingMemoryBound(to: T.self)[j])" + } + } else { + for i in 0...size { + str += " \(contents().assumingMemoryBound(to: T.self)[i])" + } } - - func makeTexture(textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture { - let textureDesc = MTLTextureDescriptor.init() - textureDesc.width = textureWidth - textureDesc.height = textureHeight - textureDesc.depth = 1 - textureDesc.usage = [.shaderRead, .shaderWrite] - textureDesc.pixelFormat = .rgba32Float - textureDesc.textureType = .type2DArray - textureDesc.storageMode = .shared - textureDesc.cpuCacheMode = .defaultCache - textureDesc.arrayLength = arrayLength - let texture = makeTexture(descriptor: textureDesc, offset: 0, bytesPerRow: textureWidth * 4 * 4)! 
- return texture + print(str) + return nil + } + + func makeTexture(textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture { + let textureDesc = MTLTextureDescriptor.init() + textureDesc.width = textureWidth + textureDesc.height = textureHeight + textureDesc.depth = 1 + textureDesc.usage = [.shaderRead, .shaderWrite] + textureDesc.pixelFormat = .rgba32Float + textureDesc.textureType = .type2DArray + textureDesc.storageMode = .shared + textureDesc.cpuCacheMode = .defaultCache + textureDesc.arrayLength = arrayLength + let texture = makeTexture(descriptor: textureDesc, offset: 0, bytesPerRow: textureWidth * 4 * 4)! + return texture + } + + func array() -> [T] { + var array: [T] = [] + let pointer = contents().bindMemory(to: T.self, capacity: length) + for i in 0..<(length / MemoryLayout.size) { + array.append(pointer[i]) } - - - + return array; + } } - - - - diff --git a/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift b/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift index a2927c4693c35fd8181d891cc33fa27c2c4cf0b9..91afae6f6415d187a69063381f3a27a6bbe92b81 100644 --- a/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift +++ b/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift @@ -1,10 +1,16 @@ -// -// TestConvAddBatchNormRelu.swift -// paddle-mobile-demo -// -// Created by liuRuiLong on 2018/7/25. -// Copyright © 2018年 orange. All rights reserved. -// +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ import Metal import Foundation @@ -17,6 +23,204 @@ public class PaddleMobileUnitTest { queue = inQueue } + private func indentPrintTensor(tensor: [Float32], dim: [Int], ix: [Int], indentLevel: Int) { + let indent = Array.init(repeating: " ", count: indentLevel).joined(separator: "") + var tx = ix + if dim.count == indentLevel + 1 { + var log: String = indent + "[" + for i in 0.. 0 { + log += ", " + } + log += tensor[c].description + } + log += "]" + if (indentLevel > 0) && (ix[indentLevel - 1] < dim[indentLevel - 1] - 1) { + log += "," + } + print(log) + } else { + print(indent + "[") + for i in 0.. 0) && (ix[indentLevel - 1] < dim[indentLevel - 1] - 1) { + print(indent + "],") + } else { + print(indent + "]") + } + } + } + + private func tensorPrint(tensor: [Float32], dim: [Int]) { + var detectPos = -1 + var odim = 1 + var ndim = dim + for i in 0..= -1) + if (detectPos == -1) { + assert(tensor.count == odim) + } else { + assert(tensor.count % odim == 0) + ndim[detectPos] = tensor.count / odim + } + indentPrintTensor(tensor: tensor, dim: ndim, ix: dim.map { $0 * 0 }, indentLevel: 0) + } + + public func testConcat() { +// let buffer = queue.makeCommandBuffer() ?! 
"buffer is nil" +// var it: [[Float32]] = [] +// for _ in 0..<7 { +// it.append((0..<12).map { Float32($0) }) +// } +// let input = it.map { device.tensor2texture(value: $0, dim: [3, 4]) } +// let output = device.tensor2texture(value: [Float32](), dim: [3, 28]) +// +// let param = ConcatTestParam.init( +// input: input, +// output: output, +// dims: [[3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]], +// axis: 1, +// odim: [3, 28] +// ) +// let concatKernel = ConcatKernel.init(device: device, testParam: param) +// concatKernel.test(cmdBuffer: buffer, param: param) +// buffer.addCompletedHandler { (buffer) in +// for i in 0...init(device: device, testParam: param) +// reshapeKernel.test(commandBuffer: buffer, testParam: param) +// buffer.addCompletedHandler { (buffer) in +// let _: Float32? = inTexture.logDesc() +// let _: Float32? = outTexture.logDesc() +// self.tensorPrint(tensor: input, dim: [2, 3, 4]) +// let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [4, 6]) +// self.tensorPrint(tensor: tx, dim: [4, 6]) +// } + +// let input: [Float32] = (0..<24).map { Float32($0) } +// let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4]) +// let outTexture = device.tensor2texture(value: [Float32](), dim: [24]) +// let mp = ReshapeMetalParam.init( +// idim: (1, 2, 3, 4), +// itrans: (0, 1, 2, 3), +// odim: (1, 1, 1, 24), +// otrans: (0, 1, 2, 3) +// ) +// let param = ReshapeTestParam.init( +// inputTexture: inTexture, +// outputTexture: outTexture, +// param: mp +// ) +// let reshapeKernel = ReshapeKernel.init(device: device, testParam: param) +// reshapeKernel.test(commandBuffer: buffer, testParam: param) +// buffer.addCompletedHandler { (buffer) in +// let _: Float32? = inTexture.logDesc() +// let _: Float32? = outTexture.logDesc() +// self.tensorPrint(tensor: input, dim: [2, 3, 4]) +// let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [24]) +// self.tensorPrint(tensor: tx, dim: [24]) +// } +// +// +// buffer.commit() + } + + public func testTranspose() { + + let buffer = queue.makeCommandBuffer() ?! "buffer is nil" +// var input: [Float32] = [] +// for i in 0..<72 { +// input.append(Float32(i)) +// } +//// let inputTexture = device.makeFloatTexture(value: input, textureWidth: 3, textureHeight: 2, arrayLength: 3) +// let inputTexture = device.tensor2texture(value: input, dim: [4, 3, 2, 3]); +// // group 1 +// let outputTexture = device.tensor2texture(value: [Float32](), dim: [3, 3, 2, 4]) +// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 3, oC: 4, axis: [3, 1, 2, 0]) +//// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [3, 0, 2, 1]) +//// // group 2 +//// let outputTexture = device.makeFloatTexture(value: [Float32](), textureWidth: 3, textureHeight: 3, arrayLength: 6) +//// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 4, axis: [3, 0, 2, 1]) +//// +// let transposeKernel = TransposeKernel.init(device: device, testParam: param) +// +// transposeKernel.test(commandBuffer: buffer, param: param) +// +// buffer.addCompletedHandler { (buffer) in +// let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false) +// let _: Float32? 
= outputTexture.logDesc(header: "output texture", stridable: false) +// self.tensorPrint(tensor: input, dim: [4, 3, 2, 3]) +// let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 3, 2, 4]) +// self.tensorPrint(tensor: tx, dim: [3, 3, 2, 4]) +// } +// +// let input: [Float32] = (0..<24).map { Float32($0) } +// let inputTexture = device.tensor2texture(value: input, dim: [2, 3, 4]) +// let outputTexture = device.tensor2texture(value: [Float](), dim: [3, 4, 2]) +// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [0, 2, 3, 1]) +// let transposeKernel = TransposeKernel.init(device: device, testParam: param) +// +// transposeKernel.test(commandBuffer: buffer, param: param) +// +// buffer.addCompletedHandler { (buffer) in +// let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false) +// let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false) +// self.tensorPrint(tensor: input, dim: [2, 3, 4]) +// let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 4, 2]) +// self.tensorPrint(tensor: tx, dim: [3, 4, 2]) +// } +// + buffer.commit() + } + public func testConvAddBnRelu() { let buffer = queue.makeCommandBuffer() ?! " buffer is nil " @@ -116,7 +320,7 @@ public class PaddleMobileUnitTest { let offsetX = filterSize.width/2 - paddings.0 let offsetY = filterSize.height/2 - paddings.1 - let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), paddedZ: UInt16(paddings.0)) + let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1)) let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize) @@ -132,16 +336,6 @@ public class PaddleMobileUnitTest { } buffer.commit() - - -// let inputTexture = device.makeFloatTexture(value: <#T##[P]#>, textureWidth: <#T##Int#>, textureHeight: <#T##Int#>, arrayLength: <#T##Int#>) - - -// let param = ConvAddBatchNormReluTestParam.init(inInputTexture: <#T##MTLTexture#>, inOutputTexture: <#T##MTLTexture#>, inMetalParam: <#T##MetalConvParam#>, inFilterBuffer: <#T##MTLBuffer#>, inBiaseBuffer: <#T##MTLBuffer#>, inNewScaleBuffer: <#T##MTLBuffer#>, inNewBiaseBuffer: <#T##MTLBuffer#>, inFilterSize: <#T##(width: Int, height: Int, channel: Int)#>) - -// ConvAddBatchNormReluKernel.init(device: <#T##MTLDevice#>, testParam: <#T##ConvAddBatchNormReluTestParam#>) - - } } diff --git a/metal/paddle-mobile/paddle-mobile/Common/Tools.swift b/metal/paddle-mobile/paddle-mobile/Common/Tools.swift index 930198fbf9c2cbfd917ddcb9ecb1fe02767c21f9..23ad7113971de3d0843abe17accfe3d67f0caaa9 100644 --- a/metal/paddle-mobile/paddle-mobile/Common/Tools.swift +++ b/metal/paddle-mobile/paddle-mobile/Common/Tools.swift @@ -1,10 +1,16 @@ -// -// Tools.swift -// paddle-mobile -// -// Created by liuRuiLong on 2018/7/26. -// Copyright © 2018年 orange. All rights reserved. -// +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ import Foundation diff --git a/metal/paddle-mobile/paddle-mobile/Common/Types.swift b/metal/paddle-mobile/paddle-mobile/Common/Types.swift index 98353617f5090f1eeac0c644c17548555638a6ca..a1197ed2188a263af3c0819fec09b584af501dd3 100644 --- a/metal/paddle-mobile/paddle-mobile/Common/Types.swift +++ b/metal/paddle-mobile/paddle-mobile/Common/Types.swift @@ -13,80 +13,228 @@ limitations under the License. */ import Foundation +import Accelerate public protocol SummableMultipliable: Equatable { - static func +(lhs: Self, rhs: Self) -> Self - static func *(lhs: Self, rhs: Self) -> Self - static func -(lhs: Self, rhs: Self) -> Self + static func +(lhs: Self, rhs: Self) -> Self + static func *(lhs: Self, rhs: Self) -> Self + static func -(lhs: Self, rhs: Self) -> Self } public protocol PrecisionType: SummableMultipliable{ - init(inFloat: Float32) - init(inFloat16: Float16) - init(_ inP: P) - static var bitSize: UInt { get } + init(inFloat: Float32) + init(inFloat16: Float16) + init(_ inP: P) + static var bitSize: UInt { get } } public typealias Float16 = Int16 extension Float16: PrecisionType { - public static func * (prefix: Float16, postfix: Float16) { - return prefix * postfix + public static func * (prefix: Float16, postfix: Float16) { + return prefix * postfix + } + + public init
<P>
(_ inP: P) where P : PrecisionType { + if P.bitSize == Float32.bitSize { + self = Float16(inFloat: inP as! Float32) + } else if P.bitSize == Float16.bitSize { + self = inP as! Float16 + } else { + fatalError() } - - public init
<P>
(_ inP: P) where P : PrecisionType { - if P.bitSize == Float32.bitSize { - self = Float16(inFloat: inP as! Float32) - } else if P.bitSize == Float16.bitSize { - self = inP as! Float16 - } else { - fatalError() + } + + public static var bitSize: UInt { + return 16 + } + + public init(inFloat16: Float16) { + self = inFloat16 + } + public init(inFloat: Float32) { + self = Int16(inFloat) + } +} + +extension Float32: PrecisionType { + public init
<P>
(_ inP: P) where P : PrecisionType { + if P.bitSize == Float32.bitSize { + self = inP as! Float32 + } else if P.bitSize == Float16.bitSize { + self = Float32.init(inP as! Float16) + } else { + fatalError() + } + } + + public init(inFloat: Float32) { + self = inFloat + } + + public init(inFloat16: Float16) { + self = Float32.init(inFloat16) + } + + public static var bitSize: UInt { + return 32 + } +} + +public func float32ToFloat16(input: UnsafeMutablePointer, output: UnsafeMutableRawPointer, count: Int) { + var float32Buffer = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4) + var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2) + guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else { + fatalError(" float 32 to float 16 error ! ") + } +} + +public func float16To32(input: UnsafeMutablePointer, count: Int) -> [Float32] { + var output = Array.init(repeating: 0.0, count: count) + float16to32(input: input, output: &output, count: count) + return output +} + +public func float16to32(input: UnsafeMutablePointer, output: UnsafeMutablePointer, count: Int) { + var bufferFloat16 = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 2) + var bufferFloat32 = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 4) + if vImageConvert_Planar16FtoPlanarF(&bufferFloat16, &bufferFloat32, 0) != kvImageNoError { + fatalError(" convert float16 to float32 error") + } +} + +// N - 0 C - 1 H - 2 W - 3 +struct DataLayout { + + static func NCHW(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout { + return DataLayout.init([(.N, dim[0]), (.C, dim[1]), (.H, dim[2]), (.W, dim[3])]) + } + + static func NHWC(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout { + return DataLayout.init([(.N, dim[0]), (.H, dim[1]), (.W, dim[2]), (.C, dim[3])]) + } + + func count() -> Int { + return layoutWithDim.count + } + + var N: Int? { + get { + for layoutDim in layoutWithDim { + if layoutDim.0 == .N { + return layoutDim.1 } + } + return nil } - - public static var bitSize: UInt { - return 16 + set { + var newN = (Layout.N, newValue) + if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in + return layout == .N + }) { + fatalError() + } } - - public init(inFloat16: Float16) { - self = inFloat16 + } + var C: Int? { + get { + for layoutDim in layoutWithDim { + if layoutDim.0 == .C { + return layoutDim.1 + } + } + return nil } - public init(inFloat: Float32) { - self = Int16(inFloat) + set { + var newN = (Layout.C, newValue) + if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in + return layout == .N + }) { + fatalError() + } } - - - -} - -extension Float32: PrecisionType { - public init
<P>
(_ inP: P) where P : PrecisionType { - if P.bitSize == Float32.bitSize { - self = inP as! Float32 - } else if P.bitSize == Float16.bitSize { - self = Float32.init(inP as! Float16) - } else { - fatalError() + } + var H: Int? { + get { + for layoutDim in layoutWithDim { + if layoutDim.0 == .H { + return layoutDim.1 } + } + return nil } - - public init(inFloat: Float32) { - self = inFloat + set { + var newN = (Layout.H, newValue) + if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in + return layout == .H + }) { + fatalError() + } } - - public init(inFloat16: Float16) { - self = Float32.init(inFloat16) + } + var W: Int? { + get { + for layoutDim in layoutWithDim { + if layoutDim.0 == .W { + return layoutDim.1 + } + } + return nil + } + set { + var newN = (Layout.W, newValue) + if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in + return layout == .W + }) { + fatalError() + } } + } + + + init(_ inLayout: [(Layout, Int)]) { + layoutWithDim = inLayout + } + + func layout() -> [Layout] { + return layoutWithDim.map({ (layout: Layout, dim: Int) -> Layout in + return layout + }) + } + + var layoutWithDim: [(Layout, Int)] = [(.N, 0), (.C, 0), (.H, 0), (.W, 0)] + + func convertTo(inLayout: [Layout]) { - public static var bitSize: UInt { - return 32 + } + + enum Layout: Int{ + case N = 0 + case C = 1 + case H = 2 + case W = 3 + static func defaultLayout() -> [Layout] { + return [N, C, H, W] } + } } -public enum DataLayout { - case NCHW - case NHWC +extension DataLayout: Equatable { + public static func == (lhs: DataLayout, rhs: DataLayout) -> Bool { + if lhs.layoutWithDim.count == rhs.layoutWithDim.count { + var result = true + for i in 0.. { + guard let inResultBuffer = resultBuffer else { + fatalError() + } + return inResultBuffer.contents().bindMemory(to: Float32.self, capacity: capacity) + } + +} + +extension FetchHolder: CustomStringConvertible, CustomDebugStringConvertible { + var description: String { + fatalError() +// return "\(result)" + } + + var debugDescription: String { + fatalError() +// return "\(result)" + } + + +} + + + diff --git a/metal/paddle-mobile/paddle-mobile/Executor.swift b/metal/paddle-mobile/paddle-mobile/Executor.swift deleted file mode 100644 index 0dcb3151e21cc0f3968a07da39366d4ba5fd5813..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Executor.swift +++ /dev/null @@ -1,153 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -import Foundation - -public class ResultHolder { - public let dim: [Int] - public let resultArr: [P] - public let elapsedTime: Double - public init(inDim: [Int], inResult: [P], inElapsedTime: Double) { - dim = inDim - resultArr = inResult - elapsedTime = inElapsedTime - } -} - -extension ResultHolder: CustomDebugStringConvertible, CustomStringConvertible { - public var debugDescription: String { - var str = "" - str += "Dim: \(dim) \n value:[ " - if resultArr.count < 20 { - for d in resultArr { - str += " \(d) " - } - } else { - for d in stride(from: 0, to: resultArr.count, by: resultArr.count/20) { - str += " \(resultArr[d]) " - } - } - str += " ]" - return str - } - - public var description: String { - return debugDescription - } -} - -public class Executor { - var ops: [Runable & InferShaperable] = [] - let program: Program - let device: MTLDevice - let queue: MTLCommandQueue - public init(inDevice:MTLDevice, inQueue: MTLCommandQueue, inProgram: Program) throws { - program = inProgram - device = inDevice - queue = inQueue - for block in inProgram.programDesc.blocks { - //block.ops.count - for i in 0...shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope) - op.inferShape() - ops.append(op) - } catch let error { - throw error - } - } - -// for op in block.ops { -// do { -// let op = try OpCreator
<P>
.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope) -// op.inferShape() -// ops.append(op) -// } catch let error { -// throw error -// } -// } - } - } - - public func predict(input: MTLTexture, expect: [Int], completionHandle: @escaping (ResultHolder
<P>
) -> Void, preProcessKernle: CusomKernel? = nil) throws { - guard let buffer = queue.makeCommandBuffer() else { - throw PaddleMobileError.predictError(message: "CommandBuffer is nil") - } - let resInput: MTLTexture - if let inPre = preProcessKernle { - do { - try inPre.compute(inputTexuture: input, commandBuffer: buffer) - resInput = inPre.outputTexture - } catch let error { - throw error - } - } else { - resInput = input - } - - let beforeDate = Date.init() - let inputTexture = InputTexture.init(inMTLTexture: resInput, inExpectDim: Dim.init(inDim: expect)) - program.scope.setInput(input: inputTexture) - - for op in ops { - do { - try op.run(device: device, buffer: buffer) - } catch let error { - throw error - } - } - - buffer.addCompletedHandler { (commandbuffer) in -// let inputArr = resInput.floatArray(res: { (p:P) -> P in -// return p -// }) -// print(inputArr) - -// let stridableInput: [(index: Int, value: Float)] = input.stridableFloatArray() -// print(stridableInput) - -// let _: Flo? = input.logDesc(header: "input: ", stridable: true) -// for op in self.ops { -// op.delogOutput() -// } -// return - -// self.ops[2].delogOutput() - - - let afterDate = Date.init() - - guard let outputVar = self.program.scope.output() else { - fatalError("output nil") - } - - guard let output = outputVar as? Texture
<P>
else { - fatalError("output var type error") - } - let resultHodlder = ResultHolder
<P>
.init(inDim: output.dim.dims, inResult: output.metalTexture.floatArray(res: { (p:P) -> P in - return p - }), inElapsedTime: afterDate.timeIntervalSince(beforeDate)) - completionHandle(resultHodlder) - } - buffer.commit() - } - - public func clear() { - program.scope.clear() - } - -} - -//public let paddle_executor: Executor = Executor.init() diff --git a/metal/paddle-mobile/paddle-mobile/Genet.swift b/metal/paddle-mobile/paddle-mobile/Genet.swift new file mode 100644 index 0000000000000000000000000000000000000000..d803d1e99537e3a24d1fae5a5653d680bd811ac2 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Genet.swift @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +public class Genet: Net { + @objc public override init(device: MTLDevice) { + super.init(device: device) + means = [128.0, 128.0, 128.0] + scale = 0.017 + except = 0 + modelPath = Bundle.main.path(forResource: "genet_model", ofType: nil) ?! "model null" + paramPath = Bundle.main.path(forResource: "genet_params", ofType: nil) ?! "para null" + modelDir = "" + preprocessKernel = GenetPreProccess.init(device: device) + dim = (n: 1, h: 128, w: 128, c: 3) + } + + @objc override public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) { + super.init(device:device,paramPointer:paramPointer,paramSize:paramSize,modePointer:modePointer,modelSize:modelSize) + means = [128.0, 128.0, 128.0] + scale = 0.017 + except = 0 + modelPath = "" + paramPath = "" + modelDir = "" + preprocessKernel = GenetPreProccess.init(device: device) + dim = (n: 1, h: 128, w: 128, c: 3) + } + + class GenetPreProccess: CusomKernel { + init(device: MTLDevice) { + let s = CusomKernel.Shape.init(inWidth: 128, inHeight: 128, inChannel: 3) + super.init(device: device, inFunctionName: "genet_preprocess", outputDim: s, usePaddleMobileLib: false) + } + } + + override public func resultStr(res: ResultHolder) -> String { +// fatalError() + return " \(res.result![0]) ... " + } + +} diff --git a/metal/paddle-mobile/paddle-mobile/Loader.swift b/metal/paddle-mobile/paddle-mobile/Loader.swift deleted file mode 100644 index c68b68e1caffcadc2adb2b4ddf245c89b2c5a223..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Loader.swift +++ /dev/null @@ -1,187 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -import Foundation -import SwiftProtobuf - -public class Loader { - class ParaLoader { - let file: UnsafeMutablePointer - let fileSize: Int - var nowIndex: Int - init(paramPath: String) throws { - guard let tmpFile = fopen(paramPath, "rb") else { - throw PaddleMobileError.loaderError(message: "open param file error" + paramPath) - } - file = tmpFile - fseek(file, 0, SEEK_END) - fileSize = ftell(file) - guard fileSize > 0 else { - throw PaddleMobileError.loaderError(message: "param file size is too small") - } - rewind(file) - nowIndex = 0 - } - - func read(tensor: Tensor
<P>
) throws { - guard nowIndex <= fileSize else { - throw PaddleMobileError.loaderError(message: "out of the file range") - } - - func pointerReader(type: T.Type) -> T { - let ptr = UnsafeMutablePointer.allocate(capacity: MemoryLayout.size) - fread(ptr, 1, MemoryLayout.size, file) - nowIndex += MemoryLayout.size - let pointee = ptr.pointee - ptr.deinitialize(count: MemoryLayout.size) - ptr.deallocate() - return pointee - } - - let _ = pointerReader(type: UInt32.self) - let lodLevel = pointerReader(type: UInt64.self) - for _ in 0...size)){ - _ = pointerReader(type: size_t.self) - } - } - - let _ = pointerReader(type: UInt32.self) - - let tensorDescSize = pointerReader(type: Int32.self) - - fseek(file, Int(tensorDescSize), SEEK_CUR) - nowIndex += Int(tensorDescSize) - - /* - 这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度 - */ - - //现在模型传入模型为 Float 类型, 这块应该根据模型来 -// let tmpCapacity = MemoryLayout.size * tensor.numel() -// let tmpPointer = UnsafeMutablePointer.allocate(capacity: tmpCapacity); - let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file) - - guard bytesRead == tensor.data.size else { - throw PaddleMobileError.loaderError(message: "param read size error") - } - - // TODO: use script to convert -// let bytesRead = fread(tmpPointer, 1, tmpCapacity, file) -// for i in 0.. Program{ - guard let modelData = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else { - throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !") - } - - do { - let protoProgram = try PaddleMobile_Framework_Proto_ProgramDesc.init( - serializedData: modelData) - - let originProgramDesc = ProgramDesc.init(protoProgram: protoProgram) - let programDesc = ProgramOptimize
<P>
.init().optimize(originProgramDesc: originProgramDesc) - print(programDesc) - - guard let paraLoader = try? ParaLoader.init(paramPath: paraPath) else { - throw PaddleMobileError.loaderError(message: "load para error") - } - - guard programDesc.blocks.count > 0 else { - throw PaddleMobileError.loaderError(message: "count of blocks must greater than 0") - } - - // to get feed key and fetch key - let block = programDesc.blocks[0] - guard let firstOp = block.ops.first, let lastOp = block.ops.last else { - throw PaddleMobileError.loaderError(message: "at least two operator") - } - guard firstOp.type == gFeedType, lastOp.type == gFetchType else { - throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch") - } - - guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else { - throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found") - } - guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else { - throw PaddleMobileError.loaderError(message: "feed key or fetch key not found") - } - - let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey) - - // to load memory - for block in programDesc.blocks { - for varDesc in block.vars { - if (varDesc.type == .LodTensor) { - guard let tensorDesc = varDesc.tensorDesc else { - throw PaddleMobileError.loaderError(message: "get tensor desc failed") - } - -// guard (try? tensorDesc.dataType.dataTypeSize()) == MemoryLayout
<P>
.size else { -// throw PaddleMobileError.memoryError(message: "PrecisionType not support") -// } - - if (varDesc.persistable - && varDesc.type != .FeedMiniBatch - && varDesc.type != .FetchList) { - let dimArr = tensorDesc.dims - - guard dimArr.count > 0 else { - throw PaddleMobileError.loaderError(message: "tensor desc dim size error") - } - - let dim = Dim.init(inDim: dimArr) - let tensor = Tensor
<P>
.init(inDim: dim, inLayout: tensorDesc.dataLayout) - do { - try paraLoader.read(tensor: tensor) - } catch let error { - throw error - } - tensor.convert(to: .NHWC) -// tensor.initBuffer(device: device) - scope[varDesc.name] = tensor - } else { - let dim = Dim.init(inDim: tensorDesc.NHWCDim) - scope[varDesc.name] = Texture
<P>
.init(device: device, inDim: dim) - } - } else { - if varDesc.name == fetchKey { - scope[varDesc.name] = ResultHolder
<P>
.init(inDim: [], inResult: [], inElapsedTime: 0.0) - } else if varDesc.name == feedKey { - } - } - } - } - - let program = Program.init(inProgramDesc: programDesc, inParamPath: paraPath, inScope: scope) - - return program - } catch _ { - throw PaddleMobileError.loaderError(message: "protobuf decoder error") - } - } -} diff --git a/metal/paddle-mobile/paddle-mobile/MobileNet.swift b/metal/paddle-mobile/paddle-mobile/MobileNet.swift new file mode 100644 index 0000000000000000000000000000000000000000..7d10a920d15e751f29fce7f9f6be71cd6a2d6b69 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/MobileNet.swift @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class MobileNet: Net{ + + class MobilenetPreProccess: CusomKernel { + init(device: MTLDevice) { + let s = CusomKernel.Shape.init(inWidth: 224, inHeight: 224, inChannel: 3) + super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, usePaddleMobileLib: false) + } + } + + class PreWords { + var contents: [String] = [] + init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) { + if let filePath = inBundle.path(forResource: fileName, ofType: type) { + let string = try! String.init(contentsOfFile: filePath) + contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{ + String($0[$0.index($0.startIndex, offsetBy: 10)...]) + } + }else{ + fatalError("no file call \(fileName)") + } + } + subscript(index: Int) -> String { + return contents[index] + } + } + + let labels = PreWords.init(fileName: "synset") + + override public func resultStr(res: ResultHolder) -> String { + guard let resPointer = res.result else { + fatalError() + } + var s: [String] = [] + (0.. String { + return " \(res)" + } + + override func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder { + +// guard let interRes = paddleMobileRes.intermediateResults else { +// fatalError(" need have inter result ") +// } +// +// guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as? Texture else { +// fatalError(" need score ") +// } +// +// guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? 
Texture else { +// fatalError() +// } +// +// var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3])) +//// print("score: ") +//// print(scoreFormatArr.strideArray()) +//// +// var bboxArr = bbox.metalTexture.float32Array() +//// print("bbox: ") +//// print(bboxArr.strideArray()) +// +// let nmsCompute = NMSCompute.init() +// nmsCompute.scoreThredshold = 0.01 +// nmsCompute.nmsTopK = 400 +// nmsCompute.keepTopK = 200 +// nmsCompute.nmsEta = 1.0 +// nmsCompute.nmsThreshold = 0.45 +// nmsCompute.background_label = 0; +// +// nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])] +// +// nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])] +// guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else { +// fatalError( " result error " ) +// } +// +// let output: [Float32] = result.map { $0.floatValue } +// +// +// return output + fatalError() + } + + + + +} diff --git a/metal/paddle-mobile/paddle-mobile/MobilenetSSD_AR.swift b/metal/paddle-mobile/paddle-mobile/MobilenetSSD_AR.swift new file mode 100644 index 0000000000000000000000000000000000000000..6c7bd9b9c6ae4f55327a370ceb1e682a8e5e7658 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/MobilenetSSD_AR.swift @@ -0,0 +1,153 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +public class MobileNet_ssd_AR: Net{ + @objc public override init(device: MTLDevice) { + super.init(device: device) + means = [103.94, 116.78, 123.68] + scale = 1 + except = 2 + modelPath = Bundle.main.path(forResource: "ar_model", ofType: nil) ?! "model null" + paramPath = Bundle.main.path(forResource: "ar_params", ofType: nil) ?! 
"para null" + modelDir = "" + preprocessKernel = MobilenetssdPreProccess.init(device: device) + dim = (n: 1, h: 160, w: 160, c: 3) + } + + @objc override public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) { + super.init(device:device,paramPointer:paramPointer,paramSize:paramSize,modePointer:modePointer,modelSize:modelSize) + means = [103.94, 116.78, 123.68] + scale = 1 + except = 2 + modelPath = "" + paramPath = "" + modelDir = "" + preprocessKernel = MobilenetssdPreProccess.init(device: device) + dim = (n: 1, h: 160, w: 160, c: 3) + } + + class MobilenetssdPreProccess: CusomKernel { + init(device: MTLDevice) { + let s = CusomKernel.Shape.init(inWidth: 160, inHeight: 160, inChannel: 3) + super.init(device: device, inFunctionName: "mobilent_ar_preprocess", outputDim: s, usePaddleMobileLib: false) + } + } + + override public func resultStr(res: ResultHolder) -> String { + return " \(res.result![0])" + } + + override func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder { + guard let interRes = paddleMobileRes.intermediateResults else { + fatalError(" need have inter result ") + } + + guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as? FetchHolder else { + fatalError(" need score ") + } + + guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? FetchHolder else { + fatalError() + } + +// let startDate = Date.init() + +// print("scoreFormatArr: ") +//print((0.. + originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1] / 7, originTexture.tensorDim[0] * 7]) + + originTexture.dim = Dim.init(inDim: [1, 1, originTexture.dim[3] / 7, originTexture.dim[2] * 7]) + + originTexture.padToFourDim = Dim.init(inDim: [1, 1, originTexture.padToFourDim[3] / 7, originTexture.padToFourDim[2] * 7]) + + program.scope[output] = originTexture + + if i == 99 { + opDesc.attrs["axis"] = 0 + } else { + opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) } + } + } + + for i in [58, 59, 88, 89, 95, 96, 68, 69, 78, 79] { + let opDesc = program.programDesc.blocks[0].ops[i] + let output = opDesc.outputs["Out"]!.first! + let v = program.scope[output]! + + + + let originTexture = v as! Texture + originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]]) + opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) } + } + + for i in [60, 101, 90, 97, 70, 80] { + let opDesc = program.programDesc.blocks[0].ops[i] + let output = opDesc.outputs["Out"]!.first! + let v = program.scope[output]! + let originTexture = v as! Texture + originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]]) + opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1 + } + + for i in [102] { + let opDesc = program.programDesc.blocks[0].ops[i] + for output in opDesc.outputs["Out"]! { + let v = program.scope[output]! + let originTexture = v as! Texture + originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]]) + } + opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! 
Int) - 1 + print(" split axis \(opDesc.attrs["axis"])") + } + // 99 + } + +} diff --git a/metal/paddle-mobile/paddle-mobile/Net.swift b/metal/paddle-mobile/paddle-mobile/Net.swift new file mode 100644 index 0000000000000000000000000000000000000000..ce9ec98a66e685eec3a688a5a29402a76567b0e2 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Net.swift @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + + +import Foundation + +public class ResultHolder: NSObject { + @objc public let result: UnsafeMutablePointer? + @objc public let capacity: Int + + init(inResult: UnsafeMutablePointer?, inCapacity: Int) { + result = inResult + capacity = inCapacity + } + + @objc public func releasePointer() { + result?.deinitialize(count: capacity) + result?.deallocate() + } +} + +public class Net: NSObject { + var except: Int = 0 + var means: [Float] = [] + var scale: Float = 0.0 + var dim: (n: Int, h: Int, w: Int, c: Int) = (n: 0, h: 0, w: 0, c: 0) + var preprocessKernel: CusomKernel? = nil + var paramPointer: UnsafeMutableRawPointer? = nil + var paramSize: Int = 0 + var modelPointer: UnsafeMutableRawPointer? = nil + var modelSize: Int = 0 + var modelPath: String = "" + var paramPath: String = "" + var modelDir: String = "" + @objc public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) { + self.paramPointer = paramPointer + self.paramSize = paramSize + self.modelPointer = modePointer + self.modelSize = modelSize + super.init() + } + + + public func resultStr(res: ResultHolder) -> String { + fatalError() + } + + func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder { + return ResultHolder.init(inResult: paddleMobileRes.resultPointer, inCapacity: paddleMobileRes.capacity) + } + + @objc public init(device: MTLDevice) { + super.init() + } + + func updateProgram(program: Program) { + + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift b/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift index 0ba02af1c51ba218982cc116e2cf8500cfa14db0..9806042e9eb339d6d15f2cbfebe924b548d29922 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift @@ -43,14 +43,31 @@ class OpCreator { [gConvType : ConvOp
<P>.creat,
                    gBatchNormType : BatchNormOp<P>.creat,
                    gReluType : ReluOp<P>.creat,
-                    gElementwiseAdd : ElementwiseAddOp<P>.creat,
+                    gElementwiseAddType : ElementwiseAddOp<P>.creat,
                    gFeedType : FeedOp<P>.creat,
                    gFetchType : FetchOp<P>.creat,
                    gConvAddBatchNormReluType : ConvAddBatchNormReluOp<P>.creat,
                    gPooType : PoolOp<P>.creat,
                    gSoftmaxType : SoftmaxOp<P>.creat,
                    gReshapeType : ReshapeOp<P>.creat,
-                    gConvAddType : ConvAddOp<P>.creat]
-
+                    gConvAddType : ConvAddOp<P>.creat,
+                    gDepthConvType : DepthConvOp<P>.creat,
+                    gConcatType : ConcatOp<P>.creat,
+                    gBoxcoderType : BoxcoderOp<P>.creat,
+                    gConvBnReluType : ConvBNReluOp<P>.creat,
+                    gDwConvBnReluType : DwConvBNReluOp<P>.creat,
+                    gMulticlassNMSType : MulticlassNMSOp<P>.creat,
+                    gTransposeType : TransposeOp<P>.creat,
+                    gPriorBoxType : PriorBoxOp<P>.creat,
+                    gPreluType : PreluOp<P>.creat,
+                    gConv2dTransposeType : ConvTransposeOp<P>.creat,
+                    gBilinearInterpType : BilinearInterpOp<P>.creat,
+                    gSplit : SplitOp<P>.creat,
+                    gShape : ShapeOp<P>.creat,
+                    gFlatten : FlattenOp<P>.creat,
+                    gConvAddPreluType : ConvAddPreluOp<P>.creat,
+                    gConvAddAddPreluType : ConvAddAddPreluOp<P>.creat,
+                    gElementwiseAddPreluType: ElementwiseAddPreluOp<P>
.creat] + private init(){} } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Base/OpParam.swift b/metal/paddle-mobile/paddle-mobile/Operators/Base/OpParam.swift index 43f095d7008ad14ac71d610728e19ac6f6817800..9f868e35864d59be5711c4ac0a02787638eeae8f 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Base/OpParam.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/OpParam.swift @@ -22,147 +22,199 @@ import Foundation */ protocol OpParam { - associatedtype OutputType: Variant - var output: OutputType { get set } - func outputDesc() -> String - - associatedtype ParamPrecisionType: PrecisionType - init(opDesc: OpDesc, inScope: Scope) throws - static func getFirstTensor(key: String, map: [String : [String]], from: Scope) throws -> VarType - static func inputX(inputs: [String : [String]], from: Scope) throws -> VarType - static func inputBiase(inputs: [String : [String]], from: Scope) throws -> VarType - static func inputMean(inputs: [String : [String]], from: Scope) throws -> VarType - static func inputScale(inputs: [String : [String]], from: Scope) throws -> VarType - static func inputVariance(inputs: [String : [String]], from: Scope) throws -> VarType - static func inputFilter(paraInputs: [String : [String]], from: Scope) throws -> VarType - static func input(inputs: [String : [String]], from: Scope) throws -> VarType - static func output(outputs: [String : [String]], from: Scope) throws -> VarType - static func outputY(outputs: [String : [String]], from: Scope) throws -> VarType - static func inputY(inputs: [String : [String]], from: Scope) throws -> VarType - static func outputOut(outputs: [String : [String]], from: Scope) throws -> VarType - static func getAttr(key: String, attrs: [String : Attr]) throws -> T + associatedtype OutputType: Variant + var output: OutputType { get set } + func outputDesc() -> String + + associatedtype ParamPrecisionType: PrecisionType + init(opDesc: OpDesc, inScope: Scope) throws + static func getFirstTensor(key: String, map: [String : [String]], from: Scope) throws -> VarType + static func inputX(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputBiase(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputMean(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputScale(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputVariance(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputFilter(paraInputs: [String : [String]], from: Scope) throws -> VarType + static func input(inputs: [String : [String]], from: Scope) throws -> VarType + static func output(outputs: [String : [String]], from: Scope) throws -> VarType + static func outputY(outputs: [String : [String]], from: Scope) throws -> VarType + static func inputY(inputs: [String : [String]], from: Scope) throws -> VarType + + static func inputImage(inputs: [String : [String]], from: Scope) throws -> VarType + + static func outputBoxes(outputs: [String : [String]], from: Scope) throws -> VarType + + static func outputOut(outputs: [String : [String]], from: Scope) throws -> VarType + + static func outputVariances(outputs: [String : [String]], from: Scope) throws -> VarType + + static func getAttr(key: String, attrs: [String : Attr]) throws -> T + + static func paramInputAlpha(inputs: [String : [String]], from: Scope) throws -> VarType + } extension OpParam { - func outputDesc() -> String { - return output.debugDescription + func outputDesc() -> String { + return 
output.debugDescription + } + + static func getFirstTensor(key: String, map: [String : [String]], from: Scope) throws -> VarType { + guard let mapKeys = map[key], mapKeys.count > 0 else { + throw PaddleMobileError.paramError(message: key + " not found in \(map) or maped values is empty") } - - static func getFirstTensor(key: String, map: [String : [String]], from: Scope) throws -> VarType { - guard let mapKeys = map[key], mapKeys.count > 0 else { - throw PaddleMobileError.paramError(message: key + " not found in \(map) or maped values is empty") - } - guard let variant = from[mapKeys[0]], let v = variant as? VarType else { - throw PaddleMobileError.paramError(message: mapKeys[0] + " not found in scope") - } - return v + guard let variant = from[mapKeys[0]] else { + throw PaddleMobileError.paramError(message: mapKeys[0] + " not found in scope") } - static func inputX(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorX: VarType = try getFirstTensor(key: "X", map: inputs, from: from) - - return tensorX - } catch let error { - throw error - } + guard let v = variant as? VarType else { + throw PaddleMobileError.paramError(message: " type error") + } - - static func input(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorInput: VarType = try getFirstTensor(key: "Input", map: inputs, from: from) - return tensorInput - } catch let error { - throw error - } + return v + } + + static func outputVariances(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorVariances: VarType = try getFirstTensor(key: "Variances", map: outputs, from: from) + return tensorVariances + } catch let error { + throw error } - - static func output(outputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorOutput: VarType = try getFirstTensor(key: "Output", map: outputs, from: from) - return tensorOutput - } catch let error { - throw error - } - } - static func outputY(outputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorOutputY: VarType = try getFirstTensor(key: "Y", map: outputs, from: from) - return tensorOutputY - } catch let error { - throw error - } - } - static func inputY(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorY: VarType = try getFirstTensor(key: "Y", map: inputs, from: from) - return tensorY - } catch let error { - throw error - } + } + + static func paramInputAlpha(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let alphaTensor: VarType = try getFirstTensor(key: "Alpha", map: inputs, from: from) + return alphaTensor + } catch let error { + throw error } - - static func outputOut(outputs: [String : [String]], from: Scope) throws -> VarType { - do { - let out: VarType = try getFirstTensor(key: "Out", map: outputs, from: from) - return out - } catch let error { - throw error - } - } - static func inputFilter(paraInputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorFilter: VarType = try getFirstTensor(key: "Filter", map: paraInputs, from: from) - return tensorFilter - } catch let error { - throw error - } + } + + + static func inputImage(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorImage: VarType = try getFirstTensor(key: "Image", map: inputs, from: from) + return tensorImage + } catch let error { + throw error } - - static func inputBiase(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorBias: VarType = try 
getFirstTensor(key: "Bias", map: inputs, from: from) - return tensorBias - } catch let error { - throw error - } + } + + static func inputX(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorX: VarType = try getFirstTensor(key: "X", map: inputs, from: from) + return tensorX + } catch let error { + throw error } - - static func inputMean(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorMean: VarType = try getFirstTensor(key: "Mean", map: inputs, from: from) - return tensorMean - } catch let error { - throw error - } + } + + static func outputBoxes(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorBox: VarType = try getFirstTensor(key: "Boxes", map: outputs, from: from) + return tensorBox + } catch let error { + throw error } - - static func inputScale(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorScale: VarType = try getFirstTensor(key: "Scale", map: inputs, from: from) - return tensorScale - } catch let error { - throw error - } + } + + static func input(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorInput: VarType = try getFirstTensor(key: "Input", map: inputs, from: from) + return tensorInput + } catch let error { + throw error } - - static func inputVariance(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorVariance: VarType = try getFirstTensor(key: "Variance", map: inputs, from: from) - return tensorVariance - } catch let error { - throw error - } + } + + static func output(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorOutput: VarType = try getFirstTensor(key: "Output", map: outputs, from: from) + return tensorOutput + } catch let error { + throw error + } + } + static func outputY(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorOutputY: VarType = try getFirstTensor(key: "Y", map: outputs, from: from) + return tensorOutputY + } catch let error { + throw error + } + } + static func inputY(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorY: VarType = try getFirstTensor(key: "Y", map: inputs, from: from) + return tensorY + } catch let error { + throw error + } + } + + static func outputOut(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let out: VarType = try getFirstTensor(key: "Out", map: outputs, from: from) + return out + } catch let error { + throw error + } + } + static func inputFilter(paraInputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorFilter: VarType = try getFirstTensor(key: "Filter", map: paraInputs, from: from) + return tensorFilter + } catch let error { + throw error + } + } + + static func inputBiase(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorBias: VarType = try getFirstTensor(key: "Bias", map: inputs, from: from) + return tensorBias + } catch let error { + throw error + } + } + + static func inputMean(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorMean: VarType = try getFirstTensor(key: "Mean", map: inputs, from: from) + return tensorMean + } catch let error { + throw error + } + } + + static func inputScale(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorScale: VarType = try getFirstTensor(key: "Scale", map: inputs, from: from) + return tensorScale + } catch let error { + throw error + } + } + + static func 
inputVariance(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorVariance: VarType = try getFirstTensor(key: "Variance", map: inputs, from: from) + return tensorVariance + } catch let error { + throw error + } + } + + static func getAttr(key: String, attrs: [String : Attr]) throws -> T{ + guard let attr = attrs[key] else { + throw PaddleMobileError.paramError(message: "attr \(key) can't found in: \(attrs)" ) } - static func getAttr(key: String, attrs: [String : Attr]) throws -> T{ - guard let attr = attrs[key] else { - throw PaddleMobileError.paramError(message: "attr \(key) can't found in: \(attrs)" ) - } - - guard let tAttr = attr as? T else { - throw PaddleMobileError.paramError(message: "key: \(key) attr: \(attr) type error" ) - } - return tAttr + guard let tAttr = attr as? T else { + throw PaddleMobileError.paramError(message: "key: \(key) attr: \(attr) type error" ) } + return tAttr + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Base/Operator.swift b/metal/paddle-mobile/paddle-mobile/Operators/Base/Operator.swift index bc95f84d8ae98cb8e4e7151f0cf69a574699dc80..40698da5ecb047dbf557cea18556616020ee9750 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Base/Operator.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/Operator.swift @@ -16,100 +16,118 @@ import Metal import Foundation protocol Fusion { - static func fusionNode() -> Node - static func change() -> [String : [(from: String, to: String)]] - static func fusionType() -> String + static func fusionNode() -> Node + static func change() -> [String : [(from: String, to: String)]] + static func fusionType() -> String + static func needCheck() -> [(Int, String)] +} +extension Fusion { + static func needCheck() -> [(Int, String)] { + return [] + } } protocol Runable { - func run(device: MTLDevice, buffer: MTLCommandBuffer) throws - func runImpl(device: MTLDevice,buffer: MTLCommandBuffer) throws - func delogOutput() + func run(device: MTLDevice, buffer: MTLCommandBuffer) throws + func runImpl(device: MTLDevice,buffer: MTLCommandBuffer) throws + func delogOutput() + func inputVariant() -> [String : [Variant]] + func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) } extension Runable where Self: OperatorProtocol{ - func run(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try runImpl(device: device, buffer: buffer) - } catch let error { - throw error - } -// print(type + ": " + para.outputDesc()) + func run(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try runImpl(device: device, buffer: buffer) + } catch let error { + throw error } + } + + func inputVariant() -> [String : [Variant]] { +// return [:] + fatalError(" op \(type) need implement inputVariant") + } + + func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) { + fatalError(" need implement ") + } + + func delogOutput() { - func delogOutput() { - print(type + ": has no implementation" ) - } + print(type + ": has no implementation" ) + } } protocol Creator where Self: OperatorProtocol{ - associatedtype OpType: OperatorProtocol & Runable & InferShaperable - static func creat(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws -> OpType + associatedtype OpType: OperatorProtocol & Runable & InferShaperable + static func creat(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws -> OpType } extension Creator where Self: OperatorProtocol { - static func creat(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws -> OpType { - do { - return try 
OpType.provide(device:device, opDesc: opDesc, inScope: inScope) - } catch let error { - throw error - } + static func creat(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws -> OpType { + do { + return try OpType.provide(device:device, opDesc: opDesc, inScope: inScope) + } catch let error { + throw error } + } } protocol InferShaperable { - func inferShape() + func inferShape() } protocol OperatorProtocol { - associatedtype ParamType - associatedtype KerType: Computable where Self.KerType.ParamType == ParamType - var type: String { get } - var scope: Scope { get } - var inputs: [String : [String]] { get } - var paraInputs: [String : [String]] { get set } - var outpus: [String : [String]] { get } - var attrs: [String : Attr] { get } - var para: ParamType { get } - var kernel: KerType { get } - init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws + associatedtype ParamType + associatedtype KerType: Computable where Self.KerType.ParamType == ParamType + var type: String { get } + var scope: Scope { get } + var inputs: [String : [String]] { get } + var paraInputs: [String : [String]] { get set } + var outpus: [String : [String]] { get } + var attrs: [String : Attr] { get } + var para: ParamType { get } + var kernel: KerType { get } + init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws } extension OperatorProtocol { - static func provide(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws -> Self { - do { - return try Self.init(device: device, opDesc: opDesc, inScope: inScope) - } catch let error { - throw error - } + static func provide(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws -> Self { + do { + return try Self.init(device: device, opDesc: opDesc, inScope: inScope) + } catch let error { + throw error } + } } class Operator : OperatorProtocol where KernelType.ParamType == ParameterType { - typealias ParamType = ParameterType - typealias KerType = KernelType - let type: String - let inputs: [String : [String]] - var paraInputs: [String : [String]] - let outpus: [String : [String]] - let attrs: [String : Attr] - let para: ParamType - let scope: Scope - var kernel: KerType - required init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws { - type = opDesc.type - scope = inScope - inputs = opDesc.inputs - outpus = opDesc.outputs - attrs = opDesc.attrs - paraInputs = opDesc.paraInputs - do { - para = try ParamType.init(opDesc:opDesc, inScope: inScope) - } catch let error { - throw error - } - kernel = KernelType.init(device: device, param: para) + typealias ParamType = ParameterType + typealias KerType = KernelType + let type: String + let inputs: [String : [String]] + var paraInputs: [String : [String]] + let outpus: [String : [String]] + let attrs: [String : Attr] + let para: ParamType + let scope: Scope + var kernel: KerType + required init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws { +// print("create op: \(opDesc.type)") + type = opDesc.type + scope = inScope + inputs = opDesc.inputs + outpus = opDesc.outputs + attrs = opDesc.attrs + paraInputs = opDesc.paraInputs + do { + para = try ParamType.init(opDesc:opDesc, inScope: inScope) + } catch let error { + throw error } + kernel = KernelType.init(device: device, param: para) + } } // op infos @@ -118,22 +136,57 @@ let gFeedType = "feed" let gConvType = "conv2d" let gBatchNormType = "batch_norm" let gReluType = "relu" -let gElementwiseAdd = "elementwise_add" +let gElementwiseAddType = "elementwise_add" let gConvAddBatchNormReluType = "conv_add_batchnorm_relu" let gPooType = 
"pool2d" let gSoftmaxType = "softmax" let gReshapeType = "reshape" let gConvAddType = "conv_add" +let gDepthConvType = "depthwise_conv2d" +let gPriorBoxType = "prior_box" +let gTransposeType = "transpose" +let gConcatType = "concat" +let gBoxcoderType = "box_coder" +let gMulticlassNMSType = "multiclass_nms" +let gConvBnReluType = "conv_bn_relu" +let gDwConvBnReluType = "depth_conv_bn_relu" +let gPreluType = "prelu" +let gConv2dTransposeType = "conv2d_transpose" +let gBilinearInterpType = "bilinear_interp" +let gSplit = "split" +let gShape = "shape" +let gFlatten = "flatten" +let gConvAddPreluType = "conv_add_prelu" +let gConvAddAddPreluType = "conv_add_add_prelu" +let gElementwiseAddPreluType = "elementwise_add_prelu" let opInfos = [gConvType : (inputs: ["Input"], outputs: ["Output"]), gBatchNormType : (inputs: ["X"], outputs: ["Y"]), gReluType : (inputs: ["X"], outputs: ["Out"]), - gElementwiseAdd : (inputs: ["X"], outputs: ["Out"]), + gElementwiseAddType : (inputs: ["X"], outputs: ["Out"]), gFeedType : (inputs: ["X"], outputs: ["Out"]), gFetchType : (inputs: ["X"], outputs: ["Out"]), gConvAddBatchNormReluType : (inputs: ["Input"], outputs: ["Out"]), gPooType : (inputs: ["X"], outputs: ["Out"]), gSoftmaxType : (inputs: ["X"], outputs: ["Out"]), gReshapeType : (inputs: ["X"], outputs: ["Out"]), - gConvAddType : (inputs: ["Input"], outputs: ["Out"])] + gConvAddType : (inputs: ["Input"], outputs: ["Out"]), + gDepthConvType : (inputs: ["Input"], outputs: ["Output"]), + gConcatType : (inputs: ["X"], outputs: ["Out"]), + gBoxcoderType : (inputs: ["PriorBox", "PriorBoxVar", "TargetBox"], outputs: ["OutputBox"]), + gTransposeType : (inputs: ["X"], outputs: ["Out"]), + gConvBnReluType : (inputs: ["Input"], outputs: ["Out"]), + gDwConvBnReluType : (inputs: ["Input"], outputs: ["Out"]), + gMulticlassNMSType : (inputs: ["BBoxes", "Scores"], outputs: ["Out"]), + gPriorBoxType : (inputs: ["Input", "Image"], outputs: ["Boxes", "Variances"]), + gPreluType : (inputs: ["X"], outputs: ["Out"]), + gConv2dTransposeType : (inputs: ["Input"], outputs: ["Output"]), + gBilinearInterpType : (inputs: ["X"], outputs: ["Out"]), + gSplit : (inputs: ["X"], outputs: ["Out"]), + gShape : (inputs: ["Input"], outputs: ["Out"]), + gFlatten : (inputs: ["X"], outputs: ["Out"]), + gConvAddPreluType : (inputs: ["Input"], outputs: ["Out"]), + gConvAddAddPreluType : (inputs: ["Input"], outputs: ["Out"]), + gElementwiseAddPreluType : (inputs: ["X"], outputs: ["Out"]) + ] diff --git a/metal/paddle-mobile/paddle-mobile/Operators/BatchNormOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/BatchNormOp.swift index 3761dad60f0f8b20e3f95168445317a3e627ada9..9fc20f8a597d39d3b628c5e1033f9c5cceac45ed 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/BatchNormOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/BatchNormOp.swift @@ -1,62 +1,66 @@ -///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. */ +/* Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ import Foundation class BatchNormParam: OpParam { - typealias ParamPrecisionType = P - required init(opDesc: OpDesc, inScope: Scope) throws { - do { - input = try BatchNormParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try BatchNormParam.outputY(outputs: opDesc.outputs, from: inScope) - inputBias = try BatchNormParam.inputBiase(inputs: opDesc.paraInputs, from: inScope) - inputMean = try BatchNormParam.inputMean(inputs: opDesc.paraInputs, from: inScope) - inputScale = try BatchNormParam.inputScale(inputs: opDesc.paraInputs, from: inScope) - inputVariance = try BatchNormParam.inputVariance(inputs: opDesc.paraInputs, from: inScope) - epsilon = try BatchNormParam.getAttr(key: "epsilon", attrs: opDesc.attrs) - momentum = try BatchNormParam.getAttr(key: "momentum", attrs: opDesc.attrs) - is_test = try BatchNormParam.getAttr(key: "is_test", attrs: opDesc.attrs) - } catch let error { - throw error - } + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try BatchNormParam.inputX(inputs: opDesc.inputs, from: inScope) + if input.transpose != [0, 2, 3, 1] { + fatalError("batch norm only accepts NHWC") + } + output = try BatchNormParam.outputY(outputs: opDesc.outputs, from: inScope) + bias = try BatchNormParam.getFirstTensor(key: "Bias", map: opDesc.paraInputs, from: inScope) + mean = try BatchNormParam.getFirstTensor(key: "Mean", map: opDesc.paraInputs, from: inScope) + scale = try BatchNormParam.getFirstTensor(key: "Scale", map: opDesc.paraInputs, from: inScope) + variance = try BatchNormParam.getFirstTensor(key: "Variance", map: opDesc.paraInputs, from: inScope) + epsilon = try BatchNormParam.getAttr(key: "epsilon", attrs: opDesc.attrs) + momentum = try BatchNormParam.getAttr(key: "momentum", attrs: opDesc.attrs) + } catch let error { + throw error } - let input: Texture
<P>
-    var output: Texture<P>
-    let inputBias: Tensor
-    let inputMean: Tensor
-    let inputScale: Tensor
-    let inputVariance: Tensor
-    let epsilon: Float
-    let momentum: Float
-    let is_test: Bool
+  }
+  let input: Texture<P>
+  var output: Texture<P>
+  let bias: Tensor<P>
+  let mean: Tensor<P>
+  let scale: Tensor<P>
+  let variance: Tensor<P>
+ let epsilon: Float + let momentum: Float } class BatchNormOp: Operator, BatchNormParam
<P>
>, Runable, Creator, InferShaperable{ - func inferShape() { - para.output.dim = para.input.dim - } - typealias OpType = BatchNormOp
<P>
- func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error - } + typealias OpType = BatchNormOp
<P>
+ + func inferShape() { + para.output.dim = para.input.dim + } + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) + } } - - - - - diff --git a/metal/paddle-mobile/paddle-mobile/Operators/BilinearInterpOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/BilinearInterpOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..8db64ac3a473fe59e7821f11abeb3437c337459d --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/BilinearInterpOp.swift @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class BilinearInterpParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try BilinearInterpParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try BilinearInterpParam.outputOut(outputs: opDesc.outputs, from: inScope) + out_h = try BilinearInterpParam.getAttr(key: "out_h", attrs: opDesc.attrs) + out_w = try BilinearInterpParam.getAttr(key: "out_w", attrs: opDesc.attrs) + } catch let error { + throw error + } + if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) { + fatalError() + } + } + let input: Texture
<P>
+  var output: Texture<P>
+ let out_h: Int + let out_w: Int +} + +class BilinearInterpOp: Operator, BilinearInterpParam
<P>
>, Runable, Creator, InferShaperable{ + + typealias OpType = BilinearInterpOp
<P>
+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) +// print(outputArray) + print(outputArray.strideArray()) + } + +} + + + + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..0e1d5f0c53128bbc2f0b5e94d2075eecdef0fcc6 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class BoxcoderParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + priorBox = try BoxcoderParam.getFirstTensor(key: "PriorBox", map: opDesc.inputs, from: inScope) + priorBoxVar = try BoxcoderParam.getFirstTensor(key: "PriorBoxVar", map: opDesc.inputs, from: inScope) + targetBox = try BoxcoderParam.getFirstTensor(key: "TargetBox", map: opDesc.inputs, from: inScope) + output = try BoxcoderParam.getFirstTensor(key: "OutputBox", map: opDesc.outputs, from: inScope) + codeType = try BoxcoderParam.getAttr(key: "code_type", attrs: opDesc.attrs) + boxNormalized = try BoxcoderParam.getAttr(key: "box_normalized", attrs: opDesc.attrs) + } catch let error { + throw error + } + assert(priorBox.tensorDim.cout() == 2) + assert(priorBoxVar.tensorDim.cout() == 2) + assert(targetBox.tensorDim.cout() == 3) + assert(output.tensorDim.cout() == 3) + assert(priorBox.transpose == [0, 1, 2, 3]) + assert(priorBoxVar.transpose == [0, 1, 2, 3]) + assert(targetBox.transpose == [0, 1, 2, 3]) + assert(codeType == "decode_center_size") // encode_center_size is not implemented + assert((targetBox.tensorDim.cout() == 3) && (targetBox.tensorDim[0] == 1)) // N must be 1 (only handle batch size = 1) + } + let priorBox: Texture
<P>
+  let priorBoxVar: Texture<P>
+  let targetBox: Texture<P>
+  var output: Texture<P>
+ let codeType: String + let boxNormalized: Bool +} + +class BoxcoderOp: Operator, BoxcoderParam
<P>
>, Runable, Creator, InferShaperable{ + + typealias OpType = BoxcoderOp
<P>
+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let pbv : [Float32] = device.texture2tensor(texture: para.priorBoxVar.metalTexture!, dim: para.priorBoxVar.tensorDim.dims, transpose: para.priorBoxVar.transpose) + let pb : [Float32] = device.texture2tensor(texture: para.priorBox.metalTexture!, dim: para.priorBox.tensorDim.dims, transpose: para.priorBox.transpose) + let tb : [Float32] = device.texture2tensor(texture: para.targetBox.metalTexture!, dim: para.targetBox.tensorDim.dims, transpose: para.targetBox.transpose) + let out : [Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(" prior box var ") + print(pbv.strideArray()) + print(" target box ") + print(tb.strideArray()) + print(" prior box ") + print(pb.strideArray()) + print(" output ") + print(out.strideArray()) + } + +} + + + + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/CNNMPSConvOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/CNNMPSConvOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..8ba74a1c31456d7cb6e9ad67974bc02055313958 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/CNNMPSConvOp.swift @@ -0,0 +1,75 @@ +// +// CNNConvAddBatchNormReluOp.swift +// paddle-mobile + +import Foundation + +class CNNMPSConvTestParam: TestParam { + var outputTexture: MTLTexture? + var metalParam: MetalConvParam + let filterPointer: UnsafeMutableRawPointer + let biasePointer: UnsafeMutablePointer + let filterSize: (width: Int, height: Int, channel: Int) + init(inMetalParam: MetalConvParam, inFilter: [Float], inBiase: [Float], inFilterSize: (width: Int, height: Int, channel: Int)) { + metalParam = inMetalParam + filterPointer = UnsafeMutableRawPointer.init(mutating: inFilter) + biasePointer = UnsafeMutablePointer.init(mutating: inBiase) + filterSize = inFilterSize + } +} + +@available(iOS 10.0, *) +class CNNMPSConvOp: Operator, CNNConvParam
<P>
>, Runable, Creator, InferShaperable, Fusion { + + typealias OpType = CNNMPSConvOp
<P>
+ + required init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws { + fatalError() + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + } + + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode-->Node.init(inType: gElementwiseAdd); + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gMPSCNNConvType + } + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + guard let xlist = opDesc.inputs["X"] else { + fatalError() + } + for x in xlist { + guard let variant = inScope[x], let v = variant as? Texture
<P>
else { + fatalError() + } + if transpose.count == 0 { + transpose = v.transpose + } + if v.transpose != transpose { + fatalError() + } + + input.append(v) + } + axis = try ConcatParam.getAttr(key: "axis", attrs: opDesc.attrs) + output = try ConcatParam.outputOut(outputs: opDesc.outputs, from: inScope) + } catch let error { + throw error + } + } + var input: [Texture
<P>
] = [] + var output: Texture
<P>
+ var transpose: [Int] = [] + let axis: Int +} + +class ConcatOp: Operator, ConcatParam
<P>
>, Runable, Creator, InferShaperable{ + + typealias OpType = ConcatOp
<P>
+ + func inferShape() { + // let dim = para.input.reduce([0, 0]) {[$0[0] + $1.dim[0], $1.dim[1]]} + // para.output.dim = Dim.init(inDim: dim) + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) + } + +} + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddAddPreluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddAddPreluOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..e5bded65a1a8944d337fea65995af79cab580105 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddAddPreluOp.swift @@ -0,0 +1,108 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ConvAddAddPreluParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + filter = try ConvAddAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvAddAddPreluParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvAddAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvAddAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvAddAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvAddAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs) + groups = try ConvAddAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs) + alpha = try ConvAddAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) + mode = try ConvAddAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs) + y = try ConvAddAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } + } + + let input: Texture

<P>
+  let y: Tensor<P>
+  let filter: Tensor<P>
+  let mode: String
+  let alpha: Tensor<P>
+  var output: Texture<P>
+  let stride: [Int32]
+  let paddings: [Int32]
+  let dilations: [Int32]
+  let groups: Int
+}
+
+class ConvAddAddPreluOp<P: PrecisionType>: Operator<ConvAddAddPreluKernel<P>, ConvAddAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
+  typealias OpType = ConvAddAddPreluOp<P>

+ + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gConvAddAddPreluType + } + + static func needCheck() -> [(Int, String)] { + return [(2, "Y"), (2, "X")] + } + + + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..: OpParam { - typealias ParamPrecisionType = P - required init(opDesc: OpDesc, inScope: Scope) throws { - do { - filter = try ConvAddBatchNormReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) - input = try ConvAddBatchNormReluParam.input(inputs: opDesc.inputs, from: inScope) - output = try ConvAddBatchNormReluParam.outputOut(outputs: opDesc.outputs, from: inScope) - stride = try ConvAddBatchNormReluParam.getAttr(key: "strides", attrs: opDesc.attrs) - paddings = try ConvAddBatchNormReluParam.getAttr(key: "paddings", attrs: opDesc.attrs) - dilations = try ConvAddBatchNormReluParam.getAttr(key: "dilations", attrs: opDesc.attrs) - epsilon = try ConvAddBatchNormReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs) - - groups = try ConvAddBatchNormReluParam.getAttr(key: "groups", attrs: opDesc.attrs) - variance = try ConvAddBatchNormReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope) - bias = try ConvAddBatchNormReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope) - scale = try ConvAddBatchNormReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope) - mean = try ConvAddBatchNormReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope) - y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.paraInputs, from: inScope) - } catch let error { - throw error - } + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + + filter = try ConvAddBatchNormReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvAddBatchNormReluParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvAddBatchNormReluParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvAddBatchNormReluParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvAddBatchNormReluParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvAddBatchNormReluParam.getAttr(key: "dilations", attrs: opDesc.attrs) + epsilon = try ConvAddBatchNormReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs) + + groups = try ConvAddBatchNormReluParam.getAttr(key: "groups", attrs: opDesc.attrs) + variance = try ConvAddBatchNormReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope) + bias = try ConvAddBatchNormReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope) + + scale = try ConvAddBatchNormReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope) + mean = try ConvAddBatchNormReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope) + y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error } - - let input: Texture

<P>
-    
-    let variance: Tensor<P>
-    let bias: Tensor<P>
-    let mean: Tensor<P>
-    let scale: Tensor<P>
-    let y: Tensor<P>
-    let filter: Tensor<P>
-    let epsilon: Float32
-    var newScale: MTLBuffer?
-    var newBiase: MTLBuffer?
-    
-    var output: Texture<P>
-    let stride: [Int32]
-    let paddings: [Int32]
-    let dilations: [Int32]
-    let groups: Int
+  }
+  
+  let input: Texture<P>
+  
+  let variance: Tensor<P>
+  let bias: Tensor<P>
+  let mean: Tensor<P>
+  let scale: Tensor<P>
+  let y: Tensor<P>
+  let filter: Tensor<P>
+  let epsilon: Float32
+  var newScale: MTLBuffer?
+  var newBiase: MTLBuffer?
+  
+  var output: Texture<P>
+  let stride: [Int32]
+  let paddings: [Int32]
+  let dilations: [Int32]
+  let groups: Int
 }
 
 class ConvAddBatchNormReluOp<P: PrecisionType>: Operator<ConvAddBatchNormReluKernel<P>, ConvAddBatchNormReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
-    typealias OpType = ConvAddBatchNormReluOp<P>
+  
+  typealias OpType = ConvAddBatchNormReluOp<P>
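// NOTE (editor's sketch, not part of the original diff): the `inferShape()` bodies of the
// conv-style operators in this patch were damaged during extraction; the loop that derives
// the spatial output size is missing above and below. A minimal, self-contained sketch of
// the standard convolution output-shape computation, assuming the conventional formula and
// reusing the names visible in the surrounding code (`inDims`, `filterDim`, `strides`,
// `paddings`, `dilations`); the exact lost loop body may differ.
func convOutputShape(inDims: [Int], filterDim: [Int], strides: [Int32],
                     paddings: [Int32], dilations: [Int32]) -> [Int] {
  var outDim = [inDims[0]]                                   // batch size passes through
  for i in 0..<strides.count {
    // effective (dilated) kernel extent along this spatial axis
    let dilatedKernel = dilations[i] * (Int32(filterDim[i + 1]) - 1) + 1
    let outSize = (Int32(inDims[i + 1]) + 2 * paddings[i] - dilatedKernel) / strides[i] + 1
    outDim.append(Int(outSize))
  }
  outDim.append(filterDim[0])                                // output channels = filter count
  return outDim
}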

+ + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations - func inferShape() { - let inDims = para.input.dim - let filterDim = para.filter.dim - let strides = para.stride - let paddings = para.paddings - let dilations = para.dilations - - var outDim = [inDims[0]] - for i in 0.. Node { - let beginNode = Node.init(inType: gConvType) - _ = beginNode - --> Node.init(inType: gElementwiseAdd) - --> Node.init(inType: gBatchNormType) - --> Node.init(inType: gReluType) - return beginNode + outDim.append(filterDim[0]) + para.output.dim = Dim.init(inDim: outDim) + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error } + } + + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gElementwiseAddType) + --> Node.init(inType: gBatchNormType) + --> Node.init(inType: gReluType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gConvAddBatchNormReluType + } + + func delogOutput() { + print(" conv add batchnorm relu output ") + print(para.output.toTensor().strideArray()) + // let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false) + // para.filter.logDataPointer(header: "filter data pointer: ") + // print("filter: \(para.filter)") - static func change() -> [String : [(from: String, to: String)]] { - return [:] - } + // print("biase: \(para.y)") + // print("padding: \(para.paddings)") + // print("stride: \(para.stride)") - static func fusionType() -> String { - return gConvAddBatchNormReluType - } + // let _: P? = para.y.buffer?.logDesc(header: " biase: ", stridable: false) + // let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false) + // let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false) - func delogOutput() { - -// let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false) -// para.filter.logDataPointer(header: "filter data pointer: ") -// print("filter: \(para.filter)") - -// print("biase: \(para.y)") -// print("padding: \(para.paddings)") -// print("stride: \(para.stride)") - -// let _: P? = para.y.buffer?.logDesc(header: " biase: ", stridable: false) -// let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false) -// let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false) - - let output = para.output.metalTexture.floatArray { (p: P) -> P in - return p - } -// - writeToLibrary(fileName: "output_112x112x32_2", array: output) - print(" write done") - -// let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false) - } + // let _: P? 
= para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false) + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift index 40069f6550ea00e986926f40c5fc2a2d4bf22a83..5e184844d886beb19ac5ff297f8a270af8a076fa 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift @@ -15,79 +15,102 @@ import Foundation class ConvAddParam: OpParam { - typealias ParamPrecisionType = P - required init(opDesc: OpDesc, inScope: Scope) throws { - do { - filter = try ConvAddParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) - input = try ConvAddParam.input(inputs: opDesc.inputs, from: inScope) - output = try ConvAddParam.outputOut(outputs: opDesc.outputs, from: inScope) - stride = try ConvAddParam.getAttr(key: "strides", attrs: opDesc.attrs) - paddings = try ConvAddParam.getAttr(key: "paddings", attrs: opDesc.attrs) - dilations = try ConvAddParam.getAttr(key: "dilations", attrs: opDesc.attrs) - groups = try ConvAddParam.getAttr(key: "groups", attrs: opDesc.attrs) - y = try ConvAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) - } catch let error { - throw error - } + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + filter = try ConvAddParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvAddParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvAddParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvAddParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvAddParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvAddParam.getAttr(key: "dilations", attrs: opDesc.attrs) + groups = try ConvAddParam.getAttr(key: "groups", attrs: opDesc.attrs) + + y = try ConvAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error } - - let input: Texture

<P>
-    let y: Tensor<P>
-    let filter: Tensor<P>
-    
-    var output: Texture<P>
-    let stride: [Int32]
-    let paddings: [Int32]
-    let dilations: [Int32]
-    let groups: Int
+  }
+  
+  let input: Texture<P>
+  let y: Tensor<P>
+  let filter: Tensor<P>
+  
+  var output: Texture<P>
+  let stride: [Int32]
+  let paddings: [Int32]
+  let dilations: [Int32]
+  let groups: Int
 }
 
 class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>

>, Runable, Creator, InferShaperable, Fusion{ - static func fusionNode() -> Node { - let beginNode = Node.init(inType: gConvType) - _ = beginNode - --> Node.init(inType: gElementwiseAdd) - return beginNode - } - - static func change() -> [String : [(from: String, to: String)]] { - return [:] - } - - static func fusionType() -> String { - return gConvAddType - } + typealias OpType = ConvAddOp

+ + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gElementwiseAddType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gConvAddType + } + + func inferShape() { - typealias OpType = ConvAddOp

+ let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations - func inferShape() { - let inDims = para.input.dim - let filterDim = para.filter.dim - let strides = para.stride - let paddings = para.paddings - let dilations = para.dilations - - var outDim = [inDims[0]] - for i in 0..: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + filter = try ConvAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvAddPreluParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs) + groups = try ConvAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs) + alpha = try ConvAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) + mode = try ConvAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs) + y = try ConvAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } + } + + let input: Texture

<P>
+  let y: Tensor<P>
+  let filter: Tensor<P>
+  let mode: String
+  let alpha: Tensor<P>
+  var output: Texture<P>
+  let stride: [Int32]
+  let paddings: [Int32]
+  let dilations: [Int32]
+  let groups: Int
+}
+
+class ConvAddPreluOp<P: PrecisionType>: Operator<ConvAddPreluKernel<P>, ConvAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
+  typealias OpType = ConvAddPreluOp<P>

+ + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gConvAddPreluType + } + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + filter = try ConvBNReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvBNReluParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvBNReluParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvBNReluParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvBNReluParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvBNReluParam.getAttr(key: "dilations", attrs: opDesc.attrs) + epsilon = try ConvBNReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs) + + groups = try ConvBNReluParam.getAttr(key: "groups", attrs: opDesc.attrs) + variance = try ConvBNReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope) + bias = try ConvBNReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope) + scale = try ConvBNReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope) + mean = try ConvBNReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } + } + + let input: Texture

<P>
+  
+  let variance: Tensor<P>
+  let bias: Tensor<P>
+  let mean: Tensor<P>
+  let scale: Tensor<P>
+  let filter: Tensor<P>
+  let epsilon: Float32
+  var newScale: MTLBuffer?
+  var newBiase: MTLBuffer?
+  
+  var output: Texture<P>
+  let stride: [Int32]
+  let paddings: [Int32]
+  let dilations: [Int32]
+  let groups: Int
+}
+
+class ConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
+  typealias OpType = ConvBNReluOp<P>

+ + func inputs() -> [Variant] { + return [para.input, para.variance, para.bias, para.mean, para.scale, para.filter] + } + + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0.. Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gBatchNormType) + --> Node.init(inType: gReluType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gConvBnReluType + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) + } + +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvOp.swift index 29b0c4246e728dbc3d3b865a189c7063ac1bbdcf..e82eb1f4753f0ebfdb5a949c85181a0ae52ea2da 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConvOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvOp.swift @@ -15,74 +15,67 @@ import Foundation class ConvParam: OpParam { - typealias ParamPrecisionType = P - required init(opDesc: OpDesc, inScope: Scope) throws { - do { - filter = try ConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) - input = try ConvParam.input(inputs: opDesc.inputs, from: inScope) - output = try ConvParam.output(outputs: opDesc.outputs, from: inScope) - stride = try ConvParam.getAttr(key: "strides", attrs: opDesc.attrs) - paddings = try ConvParam.getAttr(key: "paddings", attrs: opDesc.attrs) - dilations = try ConvParam.getAttr(key: "dilations", attrs: opDesc.attrs) - groups = try ConvParam.getAttr(key: "groups", attrs: opDesc.attrs) - - } catch let error { - throw error - } + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + filter = try ConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvParam.output(outputs: opDesc.outputs, from: inScope) + stride = try ConvParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvParam.getAttr(key: "dilations", attrs: opDesc.attrs) + groups = try ConvParam.getAttr(key: "groups", attrs: opDesc.attrs) + + } catch let error { + throw error } - - let input: Texture

<P>
-    let filter: Tensor<P>
-    var output: Texture<P>
-    let stride: [Int32]
-    let paddings: [Int32]
-    let dilations: [Int32]
-    let groups: Int
+  }
+  
+  let input: Texture<P>
+  let filter: Tensor<P>
+  var output: Texture<P>
+  let stride: [Int32]
+  let paddings: [Int32]
+  let dilations: [Int32]
+  let groups: Int
 }
 
 class ConvOp<P: PrecisionType>: Operator<ConvKernel<P>, ConvParam<P>

>, Runable, Creator, InferShaperable { - required init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws { - do { - try super.init(device: device, opDesc: opDesc, inScope: inScope) - } catch let error { - throw error - } - - } - func inferShape() { - let inDims = para.input.dim - let filterDim = para.filter.dim - let strides = para.stride - let paddings = para.paddings - let dilations = para.dilations - - var outDim = [inDims[0]] - for i in 0.. + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations - typealias OpType = ConvOp

- func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error - } + var outDim = [inDims[0]] + for i in 0..: ConvParam

<P> {
+  typealias ParamPrecisionType = P
+  required init(opDesc: OpDesc, inScope: Scope) throws {
+    do {
+      try super.init(opDesc: opDesc, inScope: inScope)
+    } catch let error {
+      throw error
+    }
+  }
+}
+
+class ConvTransposeOp<P: PrecisionType>: Operator<ConvTransposeKernel<P>, ConvTransposeParam<P>>, Runable, Creator, InferShaperable{
+  
+  typealias OpType = ConvTransposeOp<P>

+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + + print(" \(type) output: ") + let padToFourDim = para.output.padToFourDim + if para.output.transpose == [0, 1, 2, 3] { + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) + print(outputArray.strideArray()) + } else if para.output.transpose == [0, 2, 3, 1] { + let output = para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])) + print(output.strideArray()) + } else { + print(" not implement") + } + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..ec76eecf1fc9736d9dff6a4cf0d69a314a9b1e0d --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class DepthConvOp: Operator, ConvParam

<P>>, Runable, Creator, InferShaperable {
+  
+  typealias OpType = DepthConvOp<P>

+ + required init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws { + do { + try super.init(device: device, opDesc: opDesc, inScope: inScope) + } catch let error { + throw error + } + } + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..: Operator, ConvBNReluParam

<P>>, Runable, Creator, InferShaperable, Fusion{
+  typealias OpType = ConvBNReluOp<P>

+ + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0.. Node { + let beginNode = Node.init(inType: gDepthConvType) + _ = beginNode + --> Node.init(inType: gBatchNormType) + --> Node.init(inType: gReluType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gDwConvBnReluType + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift index 5ed36f86d79ffd639dc2ba76da74d24a532b1bd1..ae040dd65f74fc222275bc579338107f2ea188fd 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift @@ -15,33 +15,80 @@ import Foundation class ElementwiseAddParam: OpParam { - typealias ParamPrecisionType = P - required init(opDesc: OpDesc, inScope: Scope) throws { - do { - input = try ElementwiseAddParam.inputX(inputs: opDesc.inputs, from: inScope) - inputY = try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) - - output = try ElementwiseAddParam.outputOut(outputs: opDesc.outputs, from: inScope) - axis = try ElementwiseAddParam.getAttr(key: "axis", attrs: opDesc.attrs) - } catch let error { - throw error - } + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + inputX = try ElementwiseAddParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try ElementwiseAddParam.outputOut(outputs: opDesc.outputs, from: inScope) + axis = try ElementwiseAddParam.getAttr(key: "axis", attrs: opDesc.attrs) + } catch let error { + throw error } - let input: Texture

<P>
-    let inputY: Tensor<P>
-    var output: Texture<P>

- let axis: Int + do { + inputY = try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch _ { + let tensorY: Tensor

= try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) + let device = inputX.metalTexture!.device + inputY = Texture.init(device: device, inDim: tensorY.dim) + let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel())) + inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: computePrecision) + } + +// required init(device: MTLDevice, param: ElementwiseAddParam

) { +// param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) +// if computePrecision == .Float32 { +// super.init(device: device, inFunctionName: "elementwise_add") +// } else if computePrecision == .Float16 { +// super.init(device: device, inFunctionName: "elementwise_add_half") +// } else { +// fatalError() +// } +// } + + var offset = axis + if axis == -1 { + offset = inputX.tensorDim.cout() - inputY.tensorDim.cout() + } + for i in 0..<(inputY.tensorDim.cout()) { + assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i]) + } + } + + var inputX: Texture

<P>
+  var inputY: Texture<P>
+  var output: Texture<P>
+  var axis: Int
 }
 
 class ElementwiseAddOp<P: PrecisionType>: Operator<ElementwiseAddKernel<P>, ElementwiseAddParam<P>>, Runable, Creator, InferShaperable{
-    
-    func inferShape() {
-        para.output.dim = para.input.dim
+  typealias OpType = ElementwiseAddOp<P>
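// NOTE (editor's sketch, not part of the original diff): the new ElementwiseAddParam init
// above aligns the Y operand against X starting at `axis`, treating axis == -1 as "align
// with the trailing dimensions of X", and asserts that the overlapping dimensions match.
// A minimal standalone sketch of that alignment check over plain Int dimension arrays
// (the function name and Bool-returning signature are illustrative, not from the diff):
func broadcastAxisIsValid(xDims: [Int], yDims: [Int], axis: Int) -> Bool {
  // axis == -1 means "align Y with the trailing dimensions of X"
  let offset = (axis == -1) ? xDims.count - yDims.count : axis
  guard offset >= 0, offset + yDims.count <= xDims.count else { return false }
  for i in 0..<yDims.count where xDims[offset + i] != yDims[i] {
    return false
  }
  return true
}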

+ + func inferShape() { +// para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error } + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output) - typealias OpType = ElementwiseAddOp

- func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + let padToFourDim = para.output.padToFourDim + if para.output.transpose == [0, 1, 2, 3] { + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) + print(outputArray.strideArray()) + } else if para.output.transpose == [0, 2, 3, 1] { + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + } else { + print(" not implement") } + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddPreluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddPreluOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..333303e9bb7c1224ff50d69b5523edabe0fc81a6 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddPreluOp.swift @@ -0,0 +1,119 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ElementwiseAddPreluParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + alpha = try ElementwiseAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) + mode = try ElementwiseAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs) + inputX = try ElementwiseAddPreluParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try ElementwiseAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope) + axis = try ElementwiseAddPreluParam.getAttr(key: "axis", attrs: opDesc.attrs) + } catch let error { + throw error + } + do { + inputY = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch _ { + let tensorY: Tensor

= try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + let device = inputX.metalTexture!.device + inputY = Texture.init(device: device, inDim: tensorY.dim) + let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel())) + inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: computePrecision) + } + + // required init(device: MTLDevice, param: ElementwiseAddParam

) { + // param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) + // if computePrecision == .Float32 { + // super.init(device: device, inFunctionName: "elementwise_add") + // } else if computePrecision == .Float16 { + // super.init(device: device, inFunctionName: "elementwise_add_half") + // } else { + // fatalError() + // } + // } + + var offset = axis + if axis == -1 { + offset = inputX.tensorDim.cout() - inputY.tensorDim.cout() + } + for i in 0..<(inputY.tensorDim.cout()) { + assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i]) + } + } + + let mode: String + let alpha: Tensor

<P>
+  var inputX: Texture<P>
+  var inputY: Texture<P>
+  var output: Texture<P>
+  var axis: Int
+}
+
+class ElementwiseAddPreluOp<P: PrecisionType>: Operator<ElementwiseAddPreluKernel<P>, ElementwiseAddPreluParam<P>

>, Runable, Creator, InferShaperable, Fusion{ + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gElementwiseAddType) + _ = beginNode + --> Node.init(inType: gPreluType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gElementwiseAddPreluType + } + + typealias OpType = ElementwiseAddPreluOp

+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + + + func delogOutput() { + print(" \(type) output: ") + print(para.output) + + let padToFourDim = para.output.padToFourDim + if para.output.transpose == [0, 1, 2, 3] { + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) + print(outputArray.strideArray()) + } else if para.output.transpose == [0, 2, 3, 1] { + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + } else { + print(" not implement") + } + } +} + + + + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift index c81d9e786c91408d2412b30eaec089904df75751..382ea58b844b25bb855ed7cdc155a860bca45da5 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift @@ -15,54 +15,53 @@ import Foundation class FeedParam: OpParam{ - var output: Texture

<P>
-    var input: InputTexture {
-        return scope.input() as! InputTexture
+  var output: Texture<P>

+ var input: InputTexture { + return scope.input() as! InputTexture + } + let scope: Scope + + required init(opDesc: OpDesc, inScope: Scope) throws { + scope = inScope + do { + output = try FeedParam.outputOut(outputs: opDesc.outputs, from: inScope) + } catch let error { + throw error } - let scope: Scope - - required init(opDesc: OpDesc, inScope: Scope) throws { - scope = inScope - do { - output = try FeedParam.outputOut(outputs: opDesc.outputs, from: inScope) - } catch let error { - throw error - } - } - - typealias ParamPrecisionType = P + } + + typealias ParamPrecisionType = P } class FeedOp: Operator, FeedParam

<P>>, Runable, Creator, InferShaperable {
-    typealias OpType = FeedOp<P>

- - func inferShape() { - // print("feed input: \(para.input.expectDim)") - print("feed output: \(para.output.dim)") - // para.output.dim = - // para.output.dim = para.input.expectDim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error - } - -// let resizeKernel = ResizeKernel

.init(device: device) -// let resizeParam = ResizeParam.init(input: para.input.mtlTexture, output: para.output.metalTexture, expectDim: para.input.expectDim) -// do { -// try resizeKernel.compute(commandBuffer: buffer, param: resizeParam) -// } catch let error { -// throw error -// } + typealias OpType = FeedOp

+ + func inferShape() { + // print("feed input: \(para.input.expectDim)") + print("feed output: \(para.output.dim)") + // para.output.dim = + // para.output.dim = para.input.expectDim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error } - func delogOutput() { -// para.input.mtlTexture.logDesc() -// let _: P? = para.input.mtlTexture.logDesc(header: "feed input: ", stridable: true) -// let _: P? = para.output.metalTexture.logDesc(header: "feed output: ", stridable: false) - } + // let resizeKernel = ResizeKernel

.init(device: device) + // let resizeParam = ResizeParam.init(input: para.input.mtlTexture, output: para.output.metalTexture, expectDim: para.input.expectDim) + // do { + // try resizeKernel.compute(commandBuffer: buffer, param: resizeParam) + // } catch let error { + // throw error + // } + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/FetchOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/FetchOp.swift index 2964b89e5ddabbbbd4f2df032efa5ef2db82ec96..ade5b09099b69f4784b33a3b108cfcfe1aa1ea7f 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/FetchOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/FetchOp.swift @@ -15,40 +15,73 @@ import Foundation class FetchParam: OpParam{ - var output: Texture

<P>
-    let input: Texture<P>

- let scope: Scope - required init(opDesc: OpDesc, inScope: Scope) throws { - scope = inScope - do { - input = try FetchParam.inputX(inputs: opDesc.inputs, from: inScope) - output = input - } catch let error { - throw error - } + var output: FetchHolder + let input: Texture

+ let scope: Scope + required init(opDesc: OpDesc, inScope: Scope) throws { + scope = inScope + do { + input = try FetchParam.inputX(inputs: opDesc.inputs, from: inScope) + output = FetchHolder.init(inCapacity: input.numel(), inDim: input.tensorDim.dims) + scope.setOutput(output: output) + } catch let error { + throw error } - - typealias ParamPrecisionType = P + } + + typealias ParamPrecisionType = P } class FetchKernel: Kernel, Computable { - - func compute(commandBuffer: MTLCommandBuffer, param: FetchParam

<P>) throws {
+  
+  func compute(commandBuffer: MTLCommandBuffer, param: FetchParam<P>

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") } - - required init(device: MTLDevice, param: FetchParam

) { - super.init(device: device, inFunctionName: "texture2d_to_2d_array") + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setBuffer(param.output.resultBuffer!, offset: 0, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: FetchParam

) { + param.output.initBuffer(device: device) + if computePrecision == .Float16 { + if param.input.transpose == [0, 2, 3, 1] { + super.init(device: device, inFunctionName: "fetch_half") + } else { +// fatalError(" not support ") + super.init(device: device, inFunctionName: "fetch_placeholder_half") + print(" not support ") + } + } else if computePrecision == .Float32 { + if param.input.transpose == [0, 2, 3, 1] { + super.init(device: device, inFunctionName: "fetch") + } else { + print(" not support ") + super.init(device: device, inFunctionName: "fetch_placeholder") +// fatalError(" not support ") + } + } else { + fatalError(" not support ") } + } } -class FetchOp: Operator< FetchKernel

<P>, FetchParam<P>>, Runable, Creator, InferShaperable{
-    func inferShape() {
-        print(para.input.dim)
-    }
-    
-    typealias OpType = FetchOp<P>

- func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - scope.setOutput(output: para.output) +class FetchOp: Operator< FetchKernel

<P>, FetchParam<P>>, Runable, Creator, InferShaperable {
+  
+  typealias OpType = FetchOp<P>

+ + func inferShape() { + print(para.input.dim) + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error } + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/FlattenOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/FlattenOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..4fc5f222932ce98c4bf3e29bdf6cd8c666f5f9f1 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/FlattenOp.swift @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class FlattenParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try FlattenParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try FlattenParam.outputOut(outputs: opDesc.outputs, from: inScope) + axis = try FlattenParam.getAttr(key: "axis", attrs: opDesc.attrs) + } catch let error { + throw error + } + } + let input: Texture

<P>
+  var output: Texture<P>
+  let axis: Int
+}
+
+
+class FlattenOp<P: PrecisionType>: Operator<FlattenKernel<P>, FlattenParam<P>>, Runable, Creator, InferShaperable{
+  
+  typealias OpType = FlattenOp<P>

+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) + } + +} + + + + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..f58358761f820809685510fa4e9b5ff237567b3c --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift @@ -0,0 +1,94 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Metal +import Foundation + +public protocol TestParam { +} + +public protocol Testable { + associatedtype TestParamType: TestParam + func test(commandBuffer: MTLCommandBuffer, param: TestParamType) + init(device: MTLDevice, testParam: TestParamType) +} + + +protocol Computable { + associatedtype ParamType: OpParam + func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws + init(device: MTLDevice, param: ParamType) +} + +protocol KernelProtocol { + var pipline: MTLComputePipelineState { get set } + var functionName: String { get set } + +} + +open class Kernel { + let pipline: MTLComputePipelineState + let functionName: String + public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = true) { + pipline = device.pipeLine(funcName: inFunctionName, inPaddleMobileLib: usePaddleMobileLib) + functionName = inFunctionName + } +} + +open class CusomKernel: Kernel { + public struct Shape { + public let width: Int + public let height: Int + public let channel: Int + public init(inWidth: Int, inHeight: Int, inChannel: Int){ + width = inWidth + height = inHeight + channel = inChannel + } + } + public let outputTexture: MTLTexture + public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, usePaddleMobileLib: Bool = false) { + let textureDesc = MTLTextureDescriptor.init() + textureDesc.textureType = .type2D + textureDesc.width = outputDim.width + textureDesc.height = outputDim.height + textureDesc.depth = (outputDim.channel + 3) / 4 + + if computePrecision == .Float16 { + textureDesc.pixelFormat = .rgba16Float + } else if computePrecision == .Float32 { + textureDesc.pixelFormat = .rgba32Float + } else { + fatalError() + } + + textureDesc.usage = [.shaderRead, .shaderWrite] + textureDesc.storageMode = .shared + outputTexture = device.makeTexture(descriptor: textureDesc) ?! 
" make texture error " + + super.init(device: device, inFunctionName: inFunctionName, usePaddleMobileLib: usePaddleMobileLib) + } + + public func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(inputTexuture, index: 0) + encoder.setTexture(outputTexture, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: outputTexture) + encoder.endEncoding() + } + +} + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift index bae452dec331957ceda5a6f503802352f63a6dbe..dad8d0c6ac2e5a93273573473c700179f8b90a37 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift @@ -15,53 +15,39 @@ import Foundation class BatchNormKernel: Kernel, Computable { - var newScale: MTLBuffer - var newBias: MTLBuffer - - required init(device: MTLDevice, param: BatchNormParam

) { - guard let newScale = device.makeBuffer(length: param.inputScale.buffer.length) else { - fatalError() - } - guard let newBias = device.makeBuffer(length: param.inputBias.buffer.length) else { - fatalError() - } - self.newScale = newScale - self.newBias = newBias - - super.init(device: device, inFunctionName: "batchnorm") - - let varianceBuffer : MTLBuffer = param.inputVariance.buffer - - var invStd: [Float32] = Array(repeating: 0, count: varianceBuffer.length) - let varianceContents = varianceBuffer.contents().assumingMemoryBound(to: P.self) - for i in 0..<(varianceBuffer.length / MemoryLayout

.stride) { - invStd[i] = 1 / (Float32(varianceContents[i]) + param.epsilon).squareRoot() - } - - let newScaleContents = newScale.contents().assumingMemoryBound(to: P.self) - let newBiasContents = newBias.contents().assumingMemoryBound(to: P.self) - let scale : MTLBuffer = param.inputScale.buffer - let scaleContents = scale.contents().assumingMemoryBound(to: P.self) - let bias : MTLBuffer = param.inputBias.buffer - let biasContents = bias.contents().assumingMemoryBound(to: P.self) - let meanContents = param.inputMean.buffer.contents().assumingMemoryBound(to: P.self) - - for i in 0..<(newScale.length / MemoryLayout

.stride) { - newScaleContents[i] = P(invStd[i] * Float32(scaleContents[i])) - newBiasContents[i] = P(Float32(biasContents[i]) - Float32(meanContents[i]) * invStd[i] * Float32(scaleContents[i])) - } + required init(device: MTLDevice, param: BatchNormParam

) { + let count = param.variance.dim.numel() + let varianceP = param.variance.data.pointer + let meanP = param.mean.data.pointer + let scaleP = param.scale.data.pointer + let biasP = param.bias.data.pointer + for i in 0..) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") - } - print("BatchNorm compute") - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBuffer(newScale, offset: 0, index: 0) - encoder.setBuffer(newBias, offset: 0, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() + + param.bias.initBuffer(device: device, precision: computePrecision) + param.scale.initBuffer(device: device, precision: computePrecision) + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "batchnorm") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "batchnorm_half") + } else { + fatalError() + } + } + + func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBuffer(param.scale.buffer, offset: 0, index: 0) + encoder.setBuffer(param.bias.buffer, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormReluKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..fca5719553038732b1646fb8b15885bd03bd5624 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormReluKernel.swift @@ -0,0 +1,91 @@ +// +// BatchNormRelu.swift +// paddle-mobile +// +// Created by zhangxinjun on 2018/8/23. +// Copyright © 2018年 orange. All rights reserved. +// + +import Foundation + + +class BatchNormReluParam: BatchNormParam

<P> {
+  
+}
+
+class BatchNormReluKernel<P: PrecisionType>: Kernel, Computable{
+  
+  
+  typealias ParamType = BatchNormReluParam<P>
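// NOTE (editor's sketch, not part of the original diff): both the removed BatchNormKernel
// initializer above and the BatchNormReluKernel initializer below fold the batch-norm
// statistics into a single per-channel scale and bias on the CPU:
//   newScale = scale / sqrt(variance + epsilon)
//   newBias  = bias - mean * newScale
// A minimal sketch of that folding over plain Float32 arrays (the function and parameter
// names are illustrative, not from the diff):
func foldBatchNorm(scale: [Float32], bias: [Float32], mean: [Float32],
                   variance: [Float32], epsilon: Float32) -> (newScale: [Float32], newBias: [Float32]) {
  var newScale = [Float32](repeating: 0, count: scale.count)
  var newBias = [Float32](repeating: 0, count: scale.count)
  for i in 0..<scale.count {
    let invStd = 1 / (variance[i] + epsilon).squareRoot()   // 1 / sqrt(var + eps)
    newScale[i] = invStd * scale[i]
    newBias[i] = bias[i] - mean[i] * invStd * scale[i]
  }
  return (newScale, newBias)
}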

+ var newScale: MTLBuffer + var newBias: MTLBuffer + + required init(device: MTLDevice, testParam: BatchNormReluTestParam) { + + newScale = testParam.newScaleBuffer + newBias = testParam.newBiaseBuffer + + super.init(device: device, inFunctionName: "batch_norm_relu_3x3") + } + + required init(device: MTLDevice, param: BatchNormReluParam

) { + guard let newScale = device.makeBuffer(length: param.inputScale.buffer.length) else { + fatalError() + } + guard let newBias = device.makeBuffer(length: param.inputBias.buffer.length) else { + fatalError() + } + self.newScale = newScale + self.newBias = newBias + + super.init(device: device, inFunctionName: "batch_norm_relu_3x3") + + + let varianceBuffer : MTLBuffer = param.inputVariance.buffer + + var invStd: [Float32] = Array(repeating: 0, count: varianceBuffer.length) + let varianceContents = varianceBuffer.contents().assumingMemoryBound(to: P.self) + for i in 0..<(varianceBuffer.length / MemoryLayout

.stride) { + invStd[i] = 1 / (Float32(varianceContents[i]) + param.epsilon).squareRoot() + } + + let newScaleContents = newScale.contents().assumingMemoryBound(to: P.self) + let newBiasContents = newBias.contents().assumingMemoryBound(to: P.self) + let scale : MTLBuffer = param.inputScale.buffer + let scaleContents = scale.contents().assumingMemoryBound(to: P.self) + let bias : MTLBuffer = param.inputBias.buffer + let biasContents = bias.contents().assumingMemoryBound(to: P.self) + let meanContents = param.inputMean.buffer.contents().assumingMemoryBound(to: P.self) + + for i in 0..<(newScale.length / MemoryLayout

.stride) { + newScaleContents[i] = P(invStd[i] * Float32(scaleContents[i])) + newBiasContents[i] = P(Float32(biasContents[i]) - Float32(meanContents[i]) * invStd[i] * Float32(scaleContents[i])) + } + } + + func compute(commandBuffer: MTLCommandBuffer, param: BatchNormReluParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + fatalError() + } + encoder.setTexture(param.input as? MTLTexture, index: 0) + encoder.setTexture(param.output as? MTLTexture, index: 1) + encoder.setBuffer(newScale, offset: 0, index: 1) + encoder.setBuffer(newBias, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output as! MTLTexture) + encoder.endEncoding() + } + + func testCompute(commandBuffer: MTLCommandBuffer, testParam: BatchNormReluTestParam) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + fatalError() + } + encoder.setTexture(testParam.inputTexture, index: 0) + encoder.setTexture(testParam.outputTexture, index: 1) + encoder.setBuffer(newScale, offset: 0, index: 0) + encoder.setBuffer(newBias, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture) + encoder.endEncoding() + } + + +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BilinearInterpKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BilinearInterpKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..7f3e7433760cc1fa4d093b08027bce7c79172532 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BilinearInterpKernel.swift @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct BilinearInterpMetalParam { + var ratio_h: Float32 + var ratio_w: Float32 +} + +class BilinearInterpKernel: Kernel, Computable{ + func compute(commandBuffer: MTLCommandBuffer, param: BilinearInterpParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + var ratio_h: Float32 = 0 + var ratio_w: Float32 = 0 + if param.output.tensorDim.dims[2] > 1 { + ratio_h = Float32(param.input.tensorDim.dims[2]-1) / Float32(param.output.tensorDim.dims[2]-1) + } + if param.output.tensorDim.dims[3] > 1 { + ratio_w = Float32(param.input.tensorDim.dims[3]-1) / Float32(param.output.tensorDim.dims[3]-1) + } + var p = BilinearInterpMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w) + encoder.setBytes(&p, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: BilinearInterpParam

) { + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "bilinear_interp_float") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "bilinear_interp_half") + } else { + fatalError() + } + } + +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..c084d9b28e1dc7019a14d3ae317ddf8a64547830 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct BoxcoderMetalParam { +} + +class BoxcoderKernel: Kernel, Computable{ + func compute(commandBuffer: MTLCommandBuffer, param: BoxcoderParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.priorBox.metalTexture, index: 0) + encoder.setTexture(param.priorBoxVar.metalTexture, index: 1) + encoder.setTexture(param.targetBox.metalTexture, index: 2) + encoder.setTexture(param.output.metalTexture, index: 3) + var bmp = BoxcoderMetalParam.init() + encoder.setBytes(&bmp, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: BoxcoderParam

) { + param.output.initTexture(device: device, inTranspose: [0, 3, 1, 2], computePrecision: computePrecision) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "boxcoder_float") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "boxcoder_half") + } else { + fatalError() + } + } + +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/CNNConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/CNNConvKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..14a5bd521455632c8a67e4c1a8ebdedc6c460aa5 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/CNNConvKernel.swift @@ -0,0 +1,176 @@ +// +// CNNConvKernel.swift +// paddle-mobile +// + +import Foundation +import Metal +import Accelerate +import MetalPerformanceShaders + +@available(iOS 10.0, *) +class WeightsDataSource: NSObject, MPSCNNConvolutionDataSource { + + let desc: MPSCNNConvolutionDescriptor + let weight:UnsafeMutableRawPointer + let bias:UnsafeMutablePointer + + + + init(inDesc: MPSCNNConvolutionDescriptor, inWeight: UnsafeMutableRawPointer, inBias: UnsafeMutablePointer) { + desc = inDesc + weight = inWeight + bias = inBias + } + + + func dataType() -> MPSDataType { + return .float32 + } + + func descriptor() -> MPSCNNConvolutionDescriptor { + return desc + } + + func weights() -> UnsafeMutableRawPointer { + return self.weight + } + + func biasTerms() -> UnsafeMutablePointer? { + return self.bias + } + + func load() -> Bool { + return true + } + + func purge() { + } + + func label() -> String? { + return "Conv" + } + + +} + +@available(iOS 10.0, *) +class CNNConvParam: OpParam{ + + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + filter = try CNNConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try CNNConvParam.input(inputs: opDesc.inputs, from: inScope) + output = try CNNConvParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try CNNConvParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try CNNConvParam.getAttr(key: "paddings", attrs: opDesc.attrs) + // 暂时不用关心 + dilations = try CNNConvParam.getAttr(key: "dilations", attrs: opDesc.attrs) + // 暂时不用关心 + groups = try CNNConvParam.getAttr(key: "groups", attrs: opDesc.attrs) + + variance = try CNNConvParam.inputVariance(inputs: opDesc.paraInputs, from: inScope) + // bias + y = try CNNConvParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } + } + + var input: Texture

+ let variance: Tensor + let y: Tensor + let filter: Tensor + var output: Texture

+ let stride: [Int32] + let paddings: [Int32] + let dilations: [Int32] + let groups: Int +} + +@available(iOS 10.0, *) +class CNNConvKernel: Kernel, Computable { + + typealias ParamType = CNNConvParam
<P>
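// This kernel is backed by MetalPerformanceShaders rather than a hand-written shader:
// the filter and bias memory are exposed through WeightsDataSource, and
// MPSCNNConvolution(device:weights:) (iOS 11+) encodes the actual convolution,
// which is why super.init is passed an empty kernel function name.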
+ + var mpsImageCreator: MpsImageCreator
<P>
? + var activation:MPSCNNNeuron? + var conv:MPSCNNConvolution? + var weightDataSource:WeightsDataSource? + var param: CNNConvParam
<P>
? + var device: MTLDevice? + + + required init(device:MTLDevice, testParam:CNNMPSConvTestParam) { + self.device = device + + let desc = MPSCNNConvolutionDescriptor(kernelWidth: testParam.filterSize.width, kernelHeight: testParam.filterSize.height, inputFeatureChannels: testParam.filterSize.channel, outputFeatureChannels: testParam.filterSize.channel, neuronFilter: activation) + + desc.strideInPixelsX = Int(testParam.metalParam.offsetX) + desc.strideInPixelsY = Int(testParam.metalParam.offsetY) + + + weightDataSource = WeightsDataSource(inDesc: desc, inWeight:testParam.filterPointer, inBias:testParam.biasePointer) + + if #available(iOS 11.0, *) { + conv = MPSCNNConvolution(device: self.device!, weights: weightDataSource!) + } else { + // Fallback on earlier versions + } + + super.init(device: device, inFunctionName: "") + } + + required init(device:MTLDevice, param:CNNConvParam
<P>
) { + + self.device = device + + let inChannels: Int + let outChannels: Int + + if param.y.dim.cout() == 4 { + inChannels = (param.y.dim[3]) + outChannels = inChannels + } else { + inChannels = 0 + outChannels = inChannels + } + + let desc = MPSCNNConvolutionDescriptor(kernelWidth: param.filter.width, kernelHeight: param.filter.height, inputFeatureChannels: inChannels, outputFeatureChannels: outChannels, neuronFilter: activation) + + desc.strideInPixelsX = Int(param.stride[0]) + desc.strideInPixelsY = Int(param.stride[1]) + + + weightDataSource = WeightsDataSource(inDesc: desc, inWeight:param.filter.data.pointer as! UnsafeMutablePointer, inBias: param.y.data.pointer as! UnsafeMutablePointer) + + if #available(iOS 11.0, *) { + conv = MPSCNNConvolution(device: self.device!, weights: weightDataSource!) + } else { + // Fallback on earlier versions + } + + super.init(device: device, inFunctionName: "") + } + + func compute(commandBuffer: MTLCommandBuffer, param: CNNConvParam
<P>
) throws { + let inputImage:MPSImage = (mpsImageCreator?.createMPSImage(device: device!))! + var outputImage = (mpsImageCreator?.createMPSImage(device: device!))! + + // Runs the conv and the add as two steps; the add takes the bias as its parameter and is invoked through the Metal API + conv?.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage) + + param.input = outputImage.texture as! Texture
<P>
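// No compute pipeline is dispatched here; the MPSCNNConvolution encode above schedules
// the work on the command buffer and its output texture is cast back onto the param.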
+ } + + func testCompute(commandBuffer: MTLCommandBuffer, testParam: CNNMPSConvTestParam) throws { + let inputImage:MPSImage = (mpsImageCreator?.createMPSImage(device: device!))! + var outputImage = (mpsImageCreator?.createMPSImage(device: device!))! + + // Runs the conv and the add as two steps; the add takes the bias as its parameter and is invoked through the Metal API + conv?.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage) + + testParam.outputTexture = outputImage.texture + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Concat.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Concat.swift new file mode 100644 index 0000000000000000000000000000000000000000..25f0a21bfff420566d06a59dca626805dd0ce6e0 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Concat.swift @@ -0,0 +1,31 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ConcatKernel: Kernel, Computable{ + func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: ConcatParam
<P>
) { + super.init(device: device, inFunctionName: "concat") + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..81ef46c0b3e919615d07f667851007e95b02d54f --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift @@ -0,0 +1,147 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct ConcatTestParam: TestParam { + var input: [MTLTexture] + var output: MTLTexture + var dims: [[Int]] + var axis: Int + var odim: [Int] +} + +struct ConcatMetalParam { + var odim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1) + var axis: Int32 = 0 + var offset: Int32 = 0 + var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) + var vdim: (Int32, Int32, Int32, Int32, Int32, Int32) = (0, 0, 0, 0, 0, 0) +} + +class ConcatKernel: Kernel, Computable{ + var v = "normal" + var pm = ConcatMetalParam.init() + func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam
<P>
) throws { + + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + let num = param.input.count + for i in 0...size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: ConcatParam
<P>
) { + param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: computePrecision) + let orank = param.output.tensorDim.cout() + let num = param.input.count + assert(num <= 6) + var axis = 4 - param.output.tensorDim.cout() + param.axis + for i in 0..<4 { + if param.transpose[i] == axis { + axis = i + break + } + } + pm.axis = Int32(axis) + pm.odim = (Int32(param.output.dim[0]), Int32(param.output.dim[1]), Int32(param.output.dim[2]), Int32(param.output.dim[3])) + pm.trans = (Int32(param.output.transpose[0]), Int32(param.output.transpose[1]), Int32(param.output.transpose[2]), Int32(param.output.transpose[3])) + var vdim: [Int] = [0, 0, 0, 0, 0, 0] + for i in 0..: Kernel, Computable { + var metalParam: MetalConvParam! + required init(device: MTLDevice, param: ConvAddAddPreluParam
<P>
) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) + param.filter.initBuffer(device: device, precision: computePrecision) + param.y.initBuffer(device: device, precision: computePrecision) + param.alpha.initBuffer(device: device, precision: computePrecision) + + if computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half") + } + + } else if param.filter.channel == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half") + } + } else if param.filter.width == 3 && param.filter.height == 3 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half") + } + + } else if param.filter.width == 1 && param.filter.height == 5 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half") + } + } else if param.filter.width == 5 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half") + } + } else { + fatalError(" unsupport yet ") + } + } else if computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float") + } + } else if param.filter.channel == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float") + } + } else if param.filter.width == 3 && param.filter.height == 3 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float") + 
} + + } else if param.filter.width == 1 && param.filter.height == 5 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float") + } + } else if param.filter.width == 5 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float") + } + } else { + fatalError(" unsupport yet ") + } + } else { + fatalError() + } + + let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) + + let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) + + // print(" function: \(functionName)") + // print("offset x: \(offsetX)") + // print("offset y: \(offsetY)") + + let offsetZ = 0.0 + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + // print("metal param: ") + // print(inMetalParam) + + metalParam = inMetalParam + } + + func compute(commandBuffer: MTLCommandBuffer, param: ConvAddAddPreluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.y.buffer, offset: 0, index: 2) + encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift index 0ffe90272fe36fa30d58c7c6bd1e287d49f0e92a..66324dd47086fd7c1ccffb674c0f8b8623416e0d 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift @@ -15,124 +15,165 @@ import Foundation struct ConvAddBatchNormReluTestParam: TestParam { - let inputTexture: MTLTexture - let outputTexture: MTLTexture - var metalParam: MetalConvParam - let filterBuffer: MTLBuffer - let biaseBuffer: MTLBuffer - let newScaleBuffer: MTLBuffer - let newBiaseBuffer: MTLBuffer - let filterSize: (width: Int, height: Int, channel: Int) - init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) { - inputTexture = inInputTexture - outputTexture = inOutputTexture - metalParam = inMetalParam - filterBuffer = inFilterBuffer - biaseBuffer = inBiaseBuffer - newScaleBuffer = inNewScaleBuffer - newBiaseBuffer = inNewBiaseBuffer - filterSize = inFilterSize - } + let inputTexture: MTLTexture + let outputTexture: MTLTexture + var metalParam: MetalConvParam + let filterBuffer: MTLBuffer + let biaseBuffer: MTLBuffer + let newScaleBuffer: MTLBuffer + let newBiaseBuffer: MTLBuffer + let filterSize: (width: Int, height: Int, channel: Int) + init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) { + inputTexture = inInputTexture + outputTexture = inOutputTexture + metalParam = inMetalParam + filterBuffer = inFilterBuffer + biaseBuffer = inBiaseBuffer + newScaleBuffer = inNewScaleBuffer + newBiaseBuffer = inNewBiaseBuffer + filterSize = inFilterSize + } } class ConvAddBatchNormReluKernel: Kernel, Computable, Testable { - required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam) { - if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1") - } else if testParam.filterSize.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3") - } else { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3") - } + required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam) { + if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1") + } else if 
testParam.filterSize.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3") + } else { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3") } + } + + var metalParam: MetalConvParam! + + required init(device: MTLDevice, param: ConvAddBatchNormReluParam
<P>
) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) + param.filter.initBuffer(device: device, precision: computePrecision) + param.y.initBuffer(device: device, precision: computePrecision) + param.variance.initBuffer(device: device, precision: .Float32) + param.mean.initBuffer(device: device, precision: .Float32) + param.scale.initBuffer(device: device, precision: .Float32) + param.bias.initBuffer(device: device, precision: .Float32) - var metalParam: MetalConvParam! - - required init(device: MTLDevice, param: ConvAddBatchNormReluParam
<P>
) { - - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1") - } else if param.filter.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3") - } else { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3") - } - - param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) - param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) - - param.variance.initBuffer(device: device) - param.mean.initBuffer(device: device) - param.scale.initBuffer(device: device) - param.bias.initBuffer(device: device) - - let offsetX = param.filter.width/2 - Int(param.paddings[0]) - let offsetY = param.filter.height/2 - Int(param.paddings[1]) - - print("offset x: \(offsetX)") - print("offset y: \(offsetY)") - - let offsetZ = 0.0 - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3])) - - var invs: [P] = [] - let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) - - for i in 0...stride { - let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5) - invs.append(P(inv)) - } - - let newScale: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: param.scale.buffer.length) - let newBiase: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: param.bias.buffer.length) - - let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self) - let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self) - let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self) - for i in 0...stride { - newScale[i] = invs[i] * scaleContents[i] - newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i] - } - param.newBiase = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length) - param.newScale = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length) - - newScale.deinitialize(count: param.scale.buffer.length) - newScale.deallocate() - - newBiase.deinitialize(count: param.bias.buffer.length) - newBiase.deallocate() + if computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3") + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3") + } else { + fatalError(" unsupport ") + } + } else if computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1_half") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3_half") + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3_half") + } else { + fatalError(" unsupport ") + } + } else { + fatalError() } - func compute(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) - encoder.setBuffer(param.y.buffer, offset: 0, index: 2) - encoder.setBuffer(param.newScale!, offset: 0, index: 3) - encoder.setBuffer(param.newBiase!, offset: 0, index: 4) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() + let offsetX = param.filter.width/2 - Int(param.paddings[0]) + let offsetY = param.filter.height/2 - Int(param.paddings[1]) + + print("offset x: \(offsetX)") + print("offset y: \(offsetY)") + + let offsetZ = 0.0 + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + + var invs: [P] = [] + let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) + + for i in 0...stride { + let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5) + invs.append(P(inv)) + } + + let newScale: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: param.scale.buffer.length) + let newBiase: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: param.bias.buffer.length) + + let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self) + let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self) + let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self) + for i in 0...stride { + newScale[i] = invs[i] * scaleContents[i] + newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i] } - public func test(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluTestParam) { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - fatalError() - } - - encoder.setTexture(param.inputTexture, index: 0) - encoder.setTexture(param.outputTexture, index: 1) - var inMetalParam = param.metalParam - encoder.setBytes(&inMetalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filterBuffer, offset: 0, index: 1) - encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2) - encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3) - encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4) - encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture) - encoder.endEncoding() +// var newScaleFP16: UnsafeMutableRawPointer +// +// float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleFP16, count: param.scale.buffer.length / MemoryLayout
<P>
.size) + + +// let newBiaseFloat16 = device.makeBuffer(length: <#T##Int#>, options: <#T##MTLResourceOptions#>) + + var newBiaseBuffer: MTLBuffer + var newScaleBuffer: MTLBuffer + + if computePrecision == .Float32 { + newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)! + newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)! + } else if computePrecision == .Float16 { + + newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + + float32ToFloat16(input: newBiase as! UnsafeMutablePointer, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout
<P>
.size) + + float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout
<P>
.size) + } else { + fatalError(" unsupport ") } + + param.newBiase = newBiaseBuffer + param.newScale = newScaleBuffer + + newScale.deinitialize(count: param.scale.buffer.length) + newScale.deallocate() + + newBiase.deinitialize(count: param.bias.buffer.length) + newBiase.deallocate() + } + + func compute(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.y.buffer, offset: 0, index: 2) + encoder.setBuffer(param.newScale!, offset: 0, index: 3) + encoder.setBuffer(param.newBiase!, offset: 0, index: 4) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + public func test(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluTestParam) { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + fatalError() + } + + encoder.setTexture(param.inputTexture, index: 0) + encoder.setTexture(param.outputTexture, index: 1) + var inMetalParam = param.metalParam + encoder.setBytes(&inMetalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filterBuffer, offset: 0, index: 1) + encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2) + encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3) + encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4) + encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture) + encoder.endEncoding() + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift index 81f3aacba8dded3341237e05f9afbc1e04f70596..d5aa98d2606ceda5cbcf0f3f4c1fc0ed2adeed25 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift @@ -15,33 +15,73 @@ import Foundation class ConvAddKernel: Kernel, Computable { - var metalParam: MetalConvParam! - required init(device: MTLDevice, param: ConvAddParam
<P>
) { + var metalParam: MetalConvParam! + required init(device: MTLDevice, param: ConvAddParam
<P>
) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) + param.filter.initBuffer(device: device, precision: computePrecision) + param.y.initBuffer(device: device, precision: computePrecision) + + if computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x1_half") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half") + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_3x3_half") + } else if param.filter.width == 1 && param.filter.height == 5 { + super.init(device: device, inFunctionName: "conv_add_5x1_half") + } else if param.filter.width == 5 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x5_half") + } else { + fatalError(" unsupport yet ") + } + } else if computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { super.init(device: device, inFunctionName: "conv_add_1x1") - let offsetX = param.filter.width/2 - Int(param.paddings[0]) - let offsetY = param.filter.height/2 - Int(param.paddings[1]) - - param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) - param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) - - print("offset x: \(offsetX)") - print("offset y: \(offsetY)") - - let offsetZ = 0.0 - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3])) + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3") + } else if param.filter.width == 1 && param.filter.height == 5 { + super.init(device: device, inFunctionName: "conv_add_5x1") + } else if param.filter.width == 5 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x5") + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_3x3") + } else { + fatalError(" unsupport yet ") + } + } else { + fatalError() } - func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) - encoder.setBuffer(param.y.buffer, offset: 0, index: 2) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() + + + let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) + + let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) + +// print(" function: \(functionName)") +// print("offset x: \(offsetX)") +// print("offset y: \(offsetY)") + + let offsetZ = 0.0 + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) +// print("metal param: ") +// print(inMetalParam) + + metalParam = inMetalParam + } + + func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.y.buffer, offset: 0, index: 2) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddPreluKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..35d49953c656364799e8ca7400ef4bac445200a0 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddPreluKernel.swift @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ConvAddPreluKernel: Kernel, Computable { + var metalParam: MetalConvParam! + required init(device: MTLDevice, param: ConvAddPreluParam
<P>
) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) + param.filter.initBuffer(device: device, precision: computePrecision) + param.y.initBuffer(device: device, precision: computePrecision) + param.alpha.initBuffer(device: device, precision: computePrecision) + + if computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half") + } + + } else if param.filter.channel == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half") + } + } else if param.filter.width == 3 && param.filter.height == 3 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half") + } + + } else if param.filter.width == 1 && param.filter.height == 5 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half") + } + } else if param.filter.width == 5 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half") + } + } else { + fatalError(" unsupport yet ") + } + } else if computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float") + } + } else if param.filter.channel == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float") + } + } else if param.filter.width == 3 && param.filter.height == 3 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float") + 
} + + } else if param.filter.width == 1 && param.filter.height == 5 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float") + } + } else if param.filter.width == 5 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float") + } + } else { + fatalError(" unsupport yet ") + } + } else { + fatalError() + } + + let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) + + let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) + + // print(" function: \(functionName)") + // print("offset x: \(offsetX)") + // print("offset y: \(offsetY)") + + let offsetZ = 0.0 + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + // print("metal param: ") + // print(inMetalParam) + + metalParam = inMetalParam + } + + func compute(commandBuffer: MTLCommandBuffer, param: ConvAddPreluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.y.buffer, offset: 0, index: 2) + encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..e79f8f9be37c2575b28aef2e9169ab814c9587fe --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift @@ -0,0 +1,180 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation +import MetalPerformanceShaders + +struct ConvBNReluTestParam: TestParam { + let inputTexture: MTLTexture + let outputTexture: MTLTexture + var metalParam: MetalConvParam + let filterBuffer: MTLBuffer + let biaseBuffer: MTLBuffer + let newScaleBuffer: MTLBuffer + let newBiaseBuffer: MTLBuffer + let filterSize: (width: Int, height: Int, channel: Int) + init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) { + + inputTexture = inInputTexture + outputTexture = inOutputTexture + metalParam = inMetalParam + filterBuffer = inFilterBuffer + biaseBuffer = inBiaseBuffer + newScaleBuffer = inNewScaleBuffer + newBiaseBuffer = inNewBiaseBuffer + filterSize = inFilterSize + } +} + +class ConvBNReluKernel: Kernel, Computable, Testable { + required init(device: MTLDevice, testParam: ConvBNReluTestParam) { + if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1") + } else if testParam.filterSize.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3") + } else { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3") + } + } + + var metalParam: MetalConvParam! + + required init(device: MTLDevice, param: ConvBNReluParam
<P>
) { + + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) + param.filter.initBuffer(device: device, precision: computePrecision) + param.variance.initBuffer(device: device, precision: .Float32) + param.mean.initBuffer(device: device, precision: .Float32) + param.scale.initBuffer(device: device, precision: .Float32) + param.bias.initBuffer(device: device, precision: .Float32) + + if computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3") + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3") + } else { + fatalError(" unsupport ") + } + } else if computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half") + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half") + } else { + fatalError(" unsupport ") + } + } else { + fatalError() + } + + + + let offsetX = param.filter.width/2 - Int(param.paddings[0]) + let offsetY = param.filter.height/2 - Int(param.paddings[1]) + +// print(" param filter width: \(param.filter.width)") +// print(" param filter height: \(param.filter.height)") +// +// print(" param paddings: \(param.paddings)") +// +// print("ConvBNReluKernel offset x: \(offsetX)") +// print("ConvBNReluKernel offset y: \(offsetY)") + + let offsetZ = 0.0 + + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + + var invs: [P] = [] + let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) + + for i in 0...stride { + let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5) + invs.append(P(inv)) + } + + let newScale: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: param.scale.buffer.length) + let newBiase: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: param.bias.buffer.length) + + let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self) + let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self) + let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self) + for i in 0...stride { + newScale[i] = invs[i] * scaleContents[i] + newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i] + } + + var newBiaseBuffer: MTLBuffer + var newScaleBuffer: MTLBuffer + + if computePrecision == .Float32 { + newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)! + newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)! + } else if computePrecision == .Float16 { + + newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + + float32ToFloat16(input: newBiase as! UnsafeMutablePointer, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout
<P>
.size) + + float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout
<P>
.size) + } else { + fatalError(" unsupport ") + } + + param.newBiase = newBiaseBuffer + param.newScale = newScaleBuffer + + newScale.deinitialize(count: param.scale.buffer.length) + newScale.deallocate() + + newBiase.deinitialize(count: param.bias.buffer.length) + newBiase.deallocate() + } + + func compute(commandBuffer: MTLCommandBuffer, param: ConvBNReluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.newScale!, offset: 0, index: 2) + encoder.setBuffer(param.newBiase!, offset: 0, index: 3) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + public func test(commandBuffer: MTLCommandBuffer, param: ConvBNReluTestParam) { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + fatalError() + } + + encoder.setTexture(param.inputTexture, index: 0) + encoder.setTexture(param.outputTexture, index: 1) + var inMetalParam = param.metalParam + encoder.setBytes(&inMetalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filterBuffer, offset: 0, index: 1) + encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 2) + encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 3) + encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal deleted file mode 100644 index 9d0c6de35ed23b14a05a9c3e6398931556d535a0..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal +++ /dev/null @@ -1,400 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include -using namespace metal; - -struct MetalConvParam { - short offsetX; - short offsetY; - short offsetZ; - ushort strideX; - ushort strideY; -}; - - -kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - half4 output = half4(0.0); - - half4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - - output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - half4 output = half4(0.0); - - half4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = 
inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } - output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void conv_add_1x1_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - half4 output = half4(0.0); - - half4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); -} - -kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - half4 output = half4(0.0); - half4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = 
inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - half4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); - outTexture.write(output, gid.xy, gid.z); -} - - -/*---------------------------------------------*/ - - - -kernel void conv_add_batch_norm_relu_1x1(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 
9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } - output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void conv_add_1x1(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); -} - -kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase 
[[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = float4(0.0); - float4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - float4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); -} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift index 92c43fe3218aa0c3ecfabd9a8d85c8107ecad273..345136a503d8eda6ad23f85ef01eb53fa539d453 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift @@ -14,38 +14,49 @@ import Foundation - public struct MetalConvParam { - let offsetX: Int16 - let offsetY: Int16 - let offsetZ: Int16 - let strideX: UInt16 - let strideY: UInt16 - let paddedZ: UInt16 + let offsetX: Int16 + let offsetY: Int16 + let offsetZ: Int16 + let strideX: UInt16 + let strideY: UInt16 + let dilationX: UInt16 + let dilationY: UInt16 } class ConvKernel: Kernel, Computable { - var metalParam: MetalConvParam! - required init(device: MTLDevice, param: ConvParam
<P>
) { - super.init(device: device, inFunctionName: "conv_add_1x1") - let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0]) - let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1]) - let offsetZ = 0.0 - param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) - - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3])) + var metalParam: MetalConvParam! + required init(device: MTLDevice, param: ConvParam
<P>
) { + param.filter.initBuffer(device: device, precision: ComputePrecision.Float32) + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_1x1") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_3x3") + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_3x3") + } else { + fatalError(" unsupport ") } + + let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0]) + let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1]) + let offsetZ = 0.0 - func compute(commandBuffer: MTLCommandBuffer, param: ConvParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + } + + func compute(commandBuffer: MTLCommandBuffer, param: ConvParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } } + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..435776c850854f2fc4259e8a2089299da825f463 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift @@ -0,0 +1,83 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct MetalConvTransposeParam { + let kernelW: UInt16; + let kernelH: UInt16; + + let strideX: UInt16; + let strideY: UInt16; + + let paddingX: UInt16; + let paddingY: UInt16; + + let dilationX: UInt16; + let dilationY: UInt16; +} + +class ConvTransposeKernel: Kernel, Computable{ + var metalParam: MetalConvTransposeParam! + required init(device: MTLDevice, param: ConvTransposeParam
<P>
) { + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) + param.filter.initBuffer(device: device, precision: computePrecision, convertToNHWC: false, withTranspose: true) + if computePrecision == .Float32 { + if param.stride == [2, 2] && param.stride == [2, 2] { + super.init(device: device, inFunctionName: "conv_transpose2x2_stride2") + } else { + fatalError(" -- conv transpose unsupported yet -- ") + } + } else if computePrecision == .Float16 { + if param.stride == [2, 2] && param.stride == [2, 2] { + super.init(device: device, inFunctionName: "conv_transpose2x2_stride2_half") + } else { + fatalError(" -- conv transpose unsupported yet -- ") + } + } else { + fatalError() + } + +// let filter: [Float32] = param.filter.buffer.array() +// print(" conv transpose filter") +// print(filter) + let kernelWidth = UInt16(param.filter.width) + let kernelHeight = UInt16(param.filter.height) + + let strideX = UInt16(param.stride[0]) + let strideY = UInt16(param.stride[1]) + let paddingX = UInt16(param.paddings[0]) + let paddingY = UInt16(param.paddings[1]) + let dilationX = UInt16(param.dilations[0]) + let dilationY = UInt16(param.dilations[1]) + + metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY) + + } + + func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift index 361e77950841f2fa2b54884a2fbf394714f10902..16774a85492d2e21ca5575ed661674824319db28 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift @@ -14,13 +14,60 @@ import Foundation +struct ElementwiseAddMetalParam { + var fast: Int32 = 0 + var axis: Int32 = 0 + var ylen: Int32 = 0 + var xdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) + var xtrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) + var ydim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) + var ytrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) +} class ElementwiseAddKernel: Kernel, Computable { - required init(device: MTLDevice, param: ElementwiseAddParam
<P>
) { - super.init(device: device, inFunctionName: "elementwise_add") - } + var metalParam: ElementwiseAddMetalParam + required init(device: MTLDevice, param: ElementwiseAddParam
<P>
) { + param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) + + metalParam = ElementwiseAddMetalParam.init() + + let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) } + let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) } + let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) } + let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) } - func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam
<P>
) throws { - + metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3]) + metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3]) + metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3]) + metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3]) + if param.axis == -1 { + metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout()) + } else { + metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis) + } + metalParam.ylen = Int32(param.inputY.tensorDim.cout()) + if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) { + // print("===> elementwise_add fast!!!") + metalParam.fast = 1 + } + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "elementwise_add") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "elementwise_add_half") + } else { + fatalError() + } + } + + func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") } + encoder.setTexture(param.inputX.metalTexture, index: 0) + encoder.setTexture(param.inputY.metalTexture, index: 1) + encoder.setTexture(param.output.metalTexture, index: 2) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddPreluKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..91589864b07f10754c860d038e754e09874db54e --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddPreluKernel.swift @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + + +class ElementwiseAddPreluKernel: Kernel, Computable { + var metalParam: ElementwiseAddMetalParam + required init(device: MTLDevice, param: ElementwiseAddPreluParam
<P>
) { + param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) + param.alpha.initBuffer(device: device, precision: computePrecision) + + metalParam = ElementwiseAddMetalParam.init() + + let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) } + let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) } + let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) } + let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) } + + metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3]) + metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3]) + metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3]) + metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3]) + if param.axis == -1 { + metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout()) + } else { + metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis) + } + metalParam.ylen = Int32(param.inputY.tensorDim.cout()) + if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) { + // print("===> elementwise_add fast!!!") + metalParam.fast = 1 + } + + if computePrecision == .Float32 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "elementwise_add_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "elementwise_add_element_float") + } else { + super.init(device: device, inFunctionName: "elementwise_add_prelu_float") + } + } else if computePrecision == .Float16 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "elementwise_add_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "elementwise_add_channel_half") + } else { + super.init(device: device, inFunctionName: "elementwise_add_channel_half") + } + } else { + fatalError() + } + } + + func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddPreluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.inputX.metalTexture, index: 0) + encoder.setTexture(param.inputY.metalTexture, index: 1) + encoder.setTexture(param.output.metalTexture, index: 2) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.alpha.buffer, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/FlattenKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/FlattenKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..090c55b16160dca19bfcdc4f3467cacdbc9a20c2 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/FlattenKernel.swift @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct FlattenMetalParam { + var idim: (Int32, Int32, Int32, Int32) + var itrans: (Int32, Int32, Int32, Int32) + var odim: (Int32, Int32, Int32, Int32) + var otrans: (Int32, Int32, Int32, Int32) +} + + +class FlattenKernel: Kernel, Computable{ + + var metalParam: FlattenMetalParam + + required init(device: MTLDevice, param: FlattenParam
<P>
) { + param.output.initTexture(device: device, computePrecision: computePrecision) + var id: [Int32] = [1, 1, 1, 1] + for i in 0..) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernel.swift deleted file mode 100644 index 8f97d61e83fc71efca8a4d41705b3eb56d7dbdb3..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernel.swift +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -import Metal -import Foundation - -public protocol TestParam { -} - -public protocol Testable { - associatedtype TestParamType: TestParam - func test(commandBuffer: MTLCommandBuffer, param: TestParamType) - init(device: MTLDevice, testParam: TestParamType) -} - - -protocol Computable { - associatedtype ParamType: OpParam - func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws - init(device: MTLDevice, param: ParamType) -} - -protocol KernelProtocol { - var pipline: MTLComputePipelineState { get set } - var functionName: String { get set } - -} - -open class Kernel { - let pipline: MTLComputePipelineState - let functionName: String - public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = true) { - pipline = device.pipeLine(funcName: inFunctionName, inPaddleMobileLib: usePaddleMobileLib) - functionName = inFunctionName - } -} - -open class CusomKernel: Kernel { - public struct Shape { - public let width: Int - public let height: Int - public let channel: Int - public init(inWidth: Int, inHeight: Int, inChannel: Int){ - width = inWidth - height = inHeight - channel = inChannel - } - } - let outputTexture: MTLTexture - public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, usePaddleMobileLib: Bool = false) { - let textureDesc = MTLTextureDescriptor.init() - textureDesc.textureType = .type2D - textureDesc.width = outputDim.width - textureDesc.height = outputDim.height - textureDesc.depth = (outputDim.channel + 3) / 4 - textureDesc.pixelFormat = .rgba32Float - textureDesc.usage = [.shaderRead, .shaderWrite] - textureDesc.storageMode = .shared - outputTexture = device.makeTexture(descriptor: textureDesc) ?! 
" make texture error " - - super.init(device: device, inFunctionName: inFunctionName, usePaddleMobileLib: usePaddleMobileLib) - } - - func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - encoder.setTexture(inputTexuture, index: 0) - encoder.setTexture(outputTexture, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: outputTexture) - encoder.endEncoding() - } - -} - diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernels.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernels.metal deleted file mode 100644 index 92ee1184520d7b1df2577c1fc52cc3257de7be79..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernels.metal +++ /dev/null @@ -1,252 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include -using namespace metal; - -struct OutputDim { - ushort width; - ushort height; - ushort strideX; - ushort strideY; -}; - -kernel void resize(texture2d inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant OutputDim ¶ms [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - - constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint2 pos = gid.xy * uint2(params.strideX, params.strideY); - const half4 input = inTexture.read(pos); - outTexture.write(half4(input.x, input.y, input.z, input.w), gid.xy, gid.z); -} - -kernel void relu(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero); - const half4 input = inTexture.read(gid.xy, gid.z); - const float4 relu = fmax((float4)input, 0.0); - outTexture.write(half4(relu), gid.xy, gid.z); -} - -kernel void elementwise_add(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - const device half4 *biasTerms [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero); - const half4 input = inTexture.read(gid.xy, gid.z); - outTexture.write(input, gid.xy, gid.z); -} - -kernel void batchnorm(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - const device half4 * newScale [[buffer(0)]], - const device half4 * newBias [[buffer(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || 
- gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - const half4 input = inTexture.read(gid.xy, gid.z); - half4 output = input * newScale[gid.z] + newBias[gid.z]; - outTexture.write(output, gid.xy, gid.z); -} - -//kernel void texture2d_to_2d_array(texture2d inTexture [[texture(0)]], -// texture2d_array outTexture [[texture(1)]], -// uint3 gid [[thread_position_in_grid]]) { -// if (gid.x >= inTexture.get_width() || -// gid.y >= inTexture.get_height()){ -// return; -// } -// const half4 input = inTexture.read(gid.xy); -// outTexture.write(input, gid.xy, 0); -//} - -kernel void texture2d_to_2d_array(texture2d inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height()){ - return; - } - const float4 input = inTexture.read(gid.xy); - outTexture.write(input, gid.xy, 0); -} - - -kernel void texture2d_to_2d_array_half(texture2d inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height()){ - return; - } - const half4 input = inTexture.read(gid.xy); - outTexture.write(input, gid.xy, 0); -} - -struct PoolParam { - int ksizeX; - int ksizeY; - int strideX; - int strideY; - int paddingX; - int paddingY; - int poolType; -}; - -kernel void pool(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant PoolParam &pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - int xmin = gid.x * pm.strideX - pm.paddingX; - int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width())); - xmin = max(xmin, 0); - int ymin = gid.y * pm.strideX - pm.paddingX; - int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height())); - ymin = max(ymin, 0); - - float4 r = 0; - if (pm.poolType == 0) { - r = inTexture.read(uint2(xmin, ymin), gid.z); - for (int x = xmin; x < xmax; x++) { - for (int y = ymin; y < ymax; y++) { - r = fmax(r, inTexture.read(uint2(x, y), gid.z)); - } - } - } else if (pm.poolType == 1) { - for (int x = xmin; x < xmax; x++) { - for (int y = ymin; y < ymax; y++) { - r += inTexture.read(uint2(x, y), gid.z); - } - } - r /= pm.ksizeX * pm.ksizeY; - } - outTexture.write(r, gid.xy, gid.z); -} - - -kernel void pool_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant PoolParam &pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - int xmin = gid.x * pm.strideX - pm.paddingX; - int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width())); - xmin = max(xmin, 0); - int ymin = gid.y * pm.strideX - pm.paddingX; - int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height())); - ymin = max(ymin, 0); - - half4 r = 0; - if (pm.poolType == 0) { - r = inTexture.read(uint2(xmin, ymin), gid.z); - for (int x = xmin; x < xmax; x++) { - for (int y = ymin; y < ymax; y++) { - r = fmax(r, inTexture.read(uint2(x, y), gid.z)); - } - } - } else if (pm.poolType == 1) { - for (int x = xmin; x < xmax; x++) { - for (int y = ymin; y < ymax; y++) { - r += inTexture.read(uint2(x, y), gid.z); - } - } - r /= pm.ksizeX * pm.ksizeY; - } - outTexture.write(r, gid.xy, gid.z); -} - -kernel void 
reshape(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - - float4 r = inTexture.read(uint2(0, 0), gid.z); - outTexture.write(r, gid.xy, gid.z); -} - -kernel void reshape_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - - half4 r = inTexture.read(uint2(0, 0), gid.z); - outTexture.write(r, gid.xy, gid.z); -} - -kernel void softmax(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - int zsize = inTexture.get_array_size(); - float maxv = inTexture.read(uint2(0, 0), 0)[0]; - for (int z = 0; z < zsize; z++) { - float4 r = inTexture.read(uint2(0, 0), z); - maxv = max(maxv, max(max(r[0], r[1]), max(r[2], r[3]))); - } - float sum = 0; - for (int z = 0; z < zsize; z++) { - float4 r = inTexture.read(uint2(0, 0), z); - sum += exp(r[0] - maxv) + exp(r[1] - maxv) + exp(r[2] - maxv) + exp(r[3] - maxv); - } - float4 rr = inTexture.read(gid.xy, gid.z); - rr = exp(rr - maxv) / sum; - outTexture.write(rr, gid.xy, gid.z); -} - - -kernel void softmax_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - int zsize = inTexture.get_array_size(); - half maxv = inTexture.read(uint2(0, 0), 0)[0]; - for (int z = 0; z < zsize; z++) { - half4 r = inTexture.read(uint2(0, 0), z); - maxv = max(maxv, max(max(r[0], r[1]), max(r[2], r[3]))); - } - float sum = 0; - for (int z = 0; z < zsize; z++) { - half4 r = inTexture.read(uint2(0, 0), z); - sum += exp(r[0] - maxv) + exp(r[1] - maxv) + exp(r[2] - maxv) + exp(r[3] - maxv); - } - half4 rr = inTexture.read(gid.xy, gid.z); - rr = exp(rr - maxv) / sum; - outTexture.write(rr, gid.xy, gid.z); -} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/MulticlassNMSKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/MulticlassNMSKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..3f78efb89e47197ae0af6a1bb53955bc4a937eda --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/MulticlassNMSKernel.swift @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class MulticlassNMSKernel: Kernel, Computable{ + let pipline1: MTLComputePipelineState + + required init(device: MTLDevice, param: MulticlassNMSParam
<P>
) { + + param.middleOutput.initBuffer(device: device) + param.bboxOutput.initBuffer(device: device) + if computePrecision == .Float32 { + pipline1 = device.pipeLine(funcName: "nms_fetch_bbox", inPaddleMobileLib: true) + super.init(device: device, inFunctionName: "nms_fetch_result") + } else if computePrecision == .Float16 { + pipline1 = device.pipeLine(funcName: "nms_fetch_bbox_half", inPaddleMobileLib: true) + super.init(device: device, inFunctionName: "nms_fetch_result_half") + } else { + fatalError( " unsupport precision " ) + } + + } + + func compute(commandBuffer: MTLCommandBuffer, param: MulticlassNMSParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.scores.metalTexture, index: 0) + encoder.setBuffer(param.middleOutput.resultBuffer!, offset: 0, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.scores.metalTexture) + encoder.endEncoding() + + guard let encoderBox = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoderBox.setTexture(param.bboxes.metalTexture, index: 0) + encoderBox.setBuffer(param.bboxOutput.resultBuffer!, offset: 0, index: 0) + encoderBox.dispatch(computePipline: pipline1, outTexture: param.bboxes.metalTexture) + encoderBox.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift index 983a3acb9943f2e549b07d095c7dd4a23c1e96d9..1d66e420e236f2e0a7734838a293215807caa968 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift @@ -15,46 +15,57 @@ import Foundation struct PoolMetalParam { - let ksizeX: Int32 - let ksizeY: Int32 - let strideX: Int32 - let strideY: Int32 - let paddingX: Int32 - let paddingY: Int32 - let poolType: Int32 + let ksizeX: Int32 + let ksizeY: Int32 + let strideX: Int32 + let strideY: Int32 + let paddingX: Int32 + let paddingY: Int32 + let poolType: Int32 } class PoolKernel: Kernel, Computable{ - func compute(commandBuffer: MTLCommandBuffer, param: PoolParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") - } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - var poolType: Int32 - switch param.poolType { - case "max": - poolType = 0 - case "avg": - poolType = 1 - default: - throw PaddleMobileError.predictError(message: " unknown pooltype " + param.poolType) - } - var pmp = PoolMetalParam.init( - ksizeX: param.ksize[0], - ksizeY: param.ksize[1], - strideX: param.stride[0], - strideY: param.stride[1], - paddingX: param.padding[0], - paddingY: param.padding[1], - poolType: poolType - ) - encoder.setBytes(&pmp, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() + var metalParam: PoolMetalParam + required init(device: MTLDevice, param: PoolParam
<P>
) { + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) + + var poolType: Int32 + switch param.poolType { + case "max": + poolType = 0 + case "avg": + poolType = 1 + default: + fatalError() } + metalParam = PoolMetalParam.init( + ksizeX: param.ksize[0], + ksizeY: param.ksize[1], + strideX: param.stride[0], + strideY: param.stride[1], + paddingX: param.padding[0], + paddingY: param.padding[1], + poolType: poolType + ) - required init(device: MTLDevice, param: PoolParam
<P>
) { - super.init(device: device, inFunctionName: "pool") + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "pool") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "pool_half") + } else { + fatalError() + } + } + + func compute(commandBuffer: MTLCommandBuffer, param: PoolParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..4ee25888f06048bfe696028ea2338a56fd06053e --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class PreluKernel: Kernel, Computable{ + required init(device: MTLDevice, param: PreluParam
<P>
) { + param.alpha.initBuffer(device: device, precision: computePrecision) + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) + if computePrecision == .Float32 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "prelu_channel") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "prelu_element") + } else { + super.init(device: device, inFunctionName: "prelu_other") + } + } else if computePrecision == .Float16 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "prelu_element_half") + } else { + super.init(device: device, inFunctionName: "prelu_other_half") + } + } else { + fatalError() + } + } + + func compute(commandBuffer: MTLCommandBuffer, param: PreluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBuffer(param.alpha.buffer, offset: 0, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..be18c4411ffbef704dff61bb2aa82bc338daf163 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct PriorBoxMetalParam { + let offset: Float32 + let stepWidth: Float32 + let stepHeight: Float32 + let minSize: Float32 + let maxSize: Float32 + let imageWidth: Float32 + let imageHeight: Float32 + let clip: Bool + let numPriors: uint + let aspecRatiosSize: uint + let minSizeSize: uint + let maxSizeSize: uint +} + +class PriorBoxKernel: Kernel, Computable{ + var metalParam: PriorBoxMetalParam! + + required init(device: MTLDevice, param: PriorBoxParam
<P>
) { + + let originDim = param.output.tensorDim; + + param.output.tensorDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]]) + param.output.padToFourDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]]) + + param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: computePrecision) + param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision) + + + if computePrecision == .Float32 { + if param.min_max_aspect_ratios_order { + super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder") + } else { + super.init(device: device, inFunctionName: "prior_box") + } + + } else if computePrecision == .Float16 { + if param.min_max_aspect_ratios_order { + super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder_half") + } else { + super.init(device: device, inFunctionName: "prior_box_half") + } + } else { + fatalError() + } + + + guard param.minSizes.count == 1 else { + fatalError(" need implement ") + } + +// let n = 1 +// let h = param.output.dim[1] +// let w = param.output.dim[2] +// let c = param.output.dim[3] * param.output.dim[0] +// +// param.output.dim = Dim.init(inDim: [n, h, w, c]) +// param.output.transpose = [0, 1, 2, 3] + + let imageWidth = Float32(param.inputImage.padToFourDim[3]) + let imageHeight = Float32(param.inputImage.padToFourDim[2]) + + let featureWidth = param.input.padToFourDim[3] + let featureHeight = param.input.padToFourDim[2] + + if param.stepW == 0 || param.stepH == 0 { + param.stepW = Float32(imageWidth) / Float32(featureWidth) + param.stepH = Float32(imageHeight) / Float32(featureHeight) + } + + var outputAspectRatior: [Float32] = [] + outputAspectRatior.append(1.0) + + let epsilon = 1e-6 + for ar in param.aspectRatios { + var alreadyExist = false + for outputAr in outputAspectRatior { + if fabs(Double(ar) - Double(outputAr)) < Double(epsilon) { + alreadyExist = true + break + } + } + + if !alreadyExist { + outputAspectRatior.append(ar) + } + if param.flip { + outputAspectRatior.append(1.0 / ar) + } + } + + if computePrecision == .Float16 { + let buffer = device.makeBuffer(length: outputAspectRatior.count * MemoryLayout.size) + float32ToFloat16(input: &outputAspectRatior, output:(buffer?.contents())!, count: outputAspectRatior.count) + param.newAspectRatios = buffer + + } else if computePrecision == .Float32 { + let buffer = device.makeBuffer(bytes: outputAspectRatior, length: outputAspectRatior.count * MemoryLayout.size, options: []) + param.newAspectRatios = buffer + } else { + fatalError() + } + + let aspectRatiosSize = uint(outputAspectRatior.count) + + let maxSizeSize: uint = uint(param.maxSizes.count) + let minSizeSize: uint = uint(param.minSizes.count) + + let numPriors = aspectRatiosSize * minSizeSize + maxSizeSize + + let minSize = param.minSizes.last ?? 0.0 + let maxSize = param.maxSizes.last ?? 0.0 + + metalParam = PriorBoxMetalParam.init(offset: param.offset, stepWidth: param.stepW, stepHeight: param.stepH, minSize: minSize, maxSize: maxSize, imageWidth: imageWidth, imageHeight: imageHeight, clip: param.clip, numPriors: numPriors, aspecRatiosSize: aspectRatiosSize, minSizeSize: minSizeSize, maxSizeSize: maxSizeSize) + + } + + func compute(commandBuffer: MTLCommandBuffer, param: PriorBoxParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setTexture(param.outputVariances.metalTexture, index: 2) + + encoder.setBuffer(param.newAspectRatios!, offset: 0, index: 0) + + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 1) + + encoder.setBytes(param.variances, length: MemoryLayout.size * param.variances.count, index: 2) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift index 3c669cf4d965f7842070c4d38427f6d1d7440db5..18f279e9f3c5226d6eea5b5e6f0a42502173071e 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift @@ -15,17 +15,23 @@ import Foundation class ReluKernel: Kernel, Computable{ - func compute(commandBuffer: MTLCommandBuffer, param: ReluParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() + func compute(commandBuffer: MTLCommandBuffer, param: ReluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") } - - required init(device: MTLDevice, param: ReluParam
<P>
) { - super.init(device: device, inFunctionName: "relu") + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: ReluParam
<P>
) { + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "relu") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "relu_half") + } else { + fatalError() } + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift index 438c89e59eb7e9a2ef315997b9d8d1f3a44a5462..4114d3c3c62054235cd57fe37fe9cd83c5bb58cb 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift @@ -14,18 +14,84 @@ import Foundation +struct ReshapeMetalParam { + var idim: (Int32, Int32, Int32, Int32) + var itrans: (Int32, Int32, Int32, Int32) + var odim: (Int32, Int32, Int32, Int32) + var otrans: (Int32, Int32, Int32, Int32) +} + +struct ReshapeTestParam: TestParam { + let inputTexture: MTLTexture + let outputTexture: MTLTexture + let param: ReshapeMetalParam +} + class ReshapeKernel: Kernel, Computable{ - required init(device: MTLDevice, param: ReshapeParam
<P>
) { - super.init(device: device, inFunctionName: "reshape") + + var metalParam: ReshapeMetalParam + + required init(device: MTLDevice, param: ReshapeParam
<P>
) { + param.output.initTexture(device: device, computePrecision: computePrecision) + var id: [Int32] = [1, 1, 1, 1] + for i in 0..) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") - } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() + let it: [Int32] = param.input.transpose.map { Int32($0) } + var od: [Int32] = [1, 1, 1, 1] + for i in 0..) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + +// func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) { +// guard let encoder = commandBuffer.makeComputeCommandEncoder() else { +// fatalError() +// } +// encoder.setTexture(testParam.inputTexture, index: 0) +// encoder.setTexture(testParam.outputTexture, index: 1) +// var pm: ReshapeMetalParam = testParam.param +// encoder.setBytes(&pm, length: MemoryLayout.size, index: 0) +// encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture) +// encoder.endEncoding() +// } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeBilinearKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeBilinearKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..e5cbce1d1e196f88bb7a3b38d3e92c330774f3ba --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeBilinearKernel.swift @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct ResizeBilinearMetalParam { + var ratio_h: Float32 + var ratio_w: Float32 +} + +class ResizeBilinearKernel: Kernel, Computable{ + func compute(commandBuffer: MTLCommandBuffer, param: ResizeBilinearParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + let ratio_h: Float32 = Float32(param.input.tensorDim.dims[2]) / Float32(param.output.tensorDim.dims[2]) + let ratio_w: Float32 = Float32(param.input.tensorDim.dims[3]) / Float32(param.output.tensorDim.dims[3]) + var p = ResizeBilinearMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w) + encoder.setBytes(&p, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: ResizeBilinearParam
<P>
) { + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "resize_bilinear") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "resize_bilinear_half") + } else { + fatalError() + } + } + +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeKernel.swift deleted file mode 100644 index d2795111ad1f43c759b95aa52ed34085a4ac147a..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeKernel.swift +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -// -//import Foundation -//import MetalPerformanceShaders -// -// -//struct ResizeParam: OpParam{ -// typealias OutputType = <#type#> -// -// typealias ParamPrecisionType = <#type#> -// -// let input: MTLTexture -// let output: MTLTexture -// let expectDim: Dim -//} -// -//struct OutputDim { -// let width: UInt16 -// let height: UInt16 -// let strideX: UInt16 -// let strideY: UInt16 -//} -// -//class ResizeKernel: Kernel, Computable{ -// var lanczos: MPSImageLanczosScale -// required init(device: MTLDevice, param: ResizeParam) { -// lanczos = MPSImageLanczosScale.init(device: device) -// super.init(device: device, inFunctionName: "resize") -// } -// func compute(commandBuffer: MTLCommandBuffer, param: ResizeParam) throws { -//// guard let encoder = commandBuffer.makeComputeCommandEncoder() else { -//// throw PaddleMobileError.predictError(message: " encode is nil") -//// } -// lanczos.encode(commandBuffer: commandBuffer, sourceTexture: param.input, destinationTexture: param.output) -// -//// encoder.setTexture(param.input, index: 0) -//// encoder.setTexture(param.output, index: 1) -//// let strideX = param.input.width/param.expectDim[2] -//// let strideY = param.input.height/param.expectDim[1] -//// var outputDim = OutputDim.init(width: UInt16(param.expectDim[1]), height: UInt16(param.expectDim[2]), strideX: UInt16(strideX), strideY: UInt16(strideY)) -//// encoder.setBytes(&outputDim, length: MemoryLayout.size, index: 0) -//// encoder.dispatch(computePipline: pipline, outTexture: param.output) -//// encoder.endEncoding() -// } -// -// -// -// -//} - diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ShapeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ShapeKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..feb052a44fdc7c6134cc90f07f3fc94ad0a497df --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ShapeKernel.swift @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct ShapeMetalParam { +} + +class ShapeKernel: Kernel, Computable{ + func compute(commandBuffer: MTLCommandBuffer, param: ShapeParam
<P>
) throws { +// print("shape compute") +// guard let encoder = commandBuffer.makeComputeCommandEncoder() else { +// throw PaddleMobileError.predictError(message: " encode is nil") +// } +// encoder.setTexture(param.output.metalTexture, index: 0) +// encoder.endEncoding() + } + + required init(device: MTLDevice, param: ShapeParam
<P>
) { + param.output.initTexture(device: device, computePrecision: computePrecision) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "shape") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "shape_half") + } else { + fatalError() + } + } + +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift index b94f0286f43ec482353ff278c6c104da77f47315..5d6874da151b64fd58c2016865515778d6267551 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift @@ -14,19 +14,38 @@ import Foundation +struct SoftmaxMetalParam { + let N: Int32 + let K: Int32 +} + class SoftmaxKernel: Kernel, Computable{ - - func compute(commandBuffer: MTLCommandBuffer, param: SoftmaxParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") - } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() + + var metalParam: SoftmaxMetalParam + required init(device: MTLDevice, param: SoftmaxParam
<P>
) { + param.output.initTexture(device: device, computePrecision: computePrecision) + metalParam = SoftmaxMetalParam.init( + N: Int32(param.input.tensorDim[0]), + K: Int32(param.input.tensorDim[1]) + ) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "softmax_float") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "softmax_half") + } else { + fatalError() } - - required init(device: MTLDevice, param: SoftmaxParam
<P>
) { - super.init(device: device, inFunctionName: "softmax") + } + + func compute(commandBuffer: MTLCommandBuffer, param: SoftmaxParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SplitKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SplitKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..67e1cd9ab85c3c60d89846bab89ef10bbe513305 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SplitKernel.swift @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct SplitMetalParam { + var idim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1) + var axis: Int32 = 0 + var offset: Int32 = 0 + var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) + var vdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) +} + +class SplitKernel: Kernel, Computable{ + var smp: SplitMetalParam + func compute(commandBuffer: MTLCommandBuffer, param: SplitParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + for i in 0...size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: SplitParam
<P>
) { + // param.output.initTexture(device: device, computePrecision: computePrecision) + let num = param.outputList.count + let rank = param.input.tensorDim.cout() + assert(num >= 2 && num <= 4) + for output in param.outputList { + output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) + } + smp = SplitMetalParam.init() + smp.idim = (Int32(param.input.dim[0]), Int32(param.input.dim[1]), Int32(param.input.dim[2]), Int32(param.input.dim[3])) + smp.axis = Int32(param.axis + param.input.dim.cout() - param.input.tensorDim.cout()) + for i in 0..<4 { + if param.input.transpose[i] == smp.axis { + smp.axis = Int32(i) + break + } + } + smp.trans = (Int32(param.input.transpose[0]), Int32(param.input.transpose[1]), Int32(param.input.transpose[2]), Int32(param.input.transpose[3])) + var vdim: [Int32] = [0, 0, 0, 0] + for i in 0..: Kernel, Computable{ - func compute(commandBuffer: MTLCommandBuffer, param: FeedParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - encoder.setTexture(param.input.mtlTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.input.mtlTexture) - encoder.endEncoding() + func compute(commandBuffer: MTLCommandBuffer, param: FeedParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") } - - required init(device: MTLDevice, param: FeedParam
<P>
) { - super.init(device: device, inFunctionName: "texture2d_to_2d_array") + encoder.setTexture(param.input.mtlTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.input.mtlTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: FeedParam
<P>
) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) + if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "texture2d_to_2d_array_half") + } else if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "texture2d_to_2d_array") + } else { + fatalError() } + + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..7b872283d45bca4adb5e90a531c936f2ad5534f8 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct TransposeMetalParam { + var iC: Int32 = 0 + var oC: Int32 = 0 + var axis: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) +} + +class TransposeKernel: Kernel, Computable { + var metalParam: TransposeMetalParam = TransposeMetalParam.init() + required init(device: MTLDevice, param: TransposeParam
<P>
) { + param.output.initTexture(device: device, computePrecision: computePrecision) + let rank = param.input.tensorDim.cout() + var axis: [Int] = [0, 1, 2, 3] + for i in 0..", kernelFunc) + print(metalParam) + super.init(device: device, inFunctionName: kernelFunc) + } + + func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..96333a07a9669ecb2b5bfe901d71be729e37b533 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormKernel.metal @@ -0,0 +1,42 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +using namespace metal; + +kernel void batchnorm(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device float4 * nscale [[buffer(0)]], + const device float4 * nbias [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + const float4 input = inTexture.read(gid.xy, gid.z); + float4 output = input * nscale[gid.z] + nbias[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} + +kernel void batchnorm_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device half4 * newScale [[buffer(0)]], + const device half4 * newBias [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + const half4 input = inTexture.read(gid.xy, gid.z); + half4 output = input * newScale[gid.z] + newBias[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormRelu.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormRelu.metal new file mode 100644 index 0000000000000000000000000000000000000000..eb94408c8ac664be5cf62bc28bfb02825856ebd4 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormRelu.metal @@ -0,0 +1,36 @@ +// +// BatchNormRelu.metal +// paddle-mobile +// + +#include +using namespace metal; + +struct MetalConvParam { + short offsetX; + short offsetY; + short offsetZ; + ushort strideX; + ushort strideY; +}; + +kernel void batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device float4 *new_scale [[buffer(0)]], + const device float4 *new_biase [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) 
{ + return; + } + + float4 input; + float4 output; + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + input = inTexture.sample(sample, gid.x, gid.y, gid.z); + output = fmax(input * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); + +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.inc.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..a590f8089890f2fab1af4c1f736f3bfc5708aecf --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.inc.metal @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifdef P + +#define CONCAT2(a, b) a ## b +#define CONCAT2_(a, b) a ## _ ## b + +#define FUNC(f, p) CONCAT2_(f, p) +#define VECTOR(p, n) CONCAT2(p, n) + +kernel void FUNC(bilinear_interp, P)(texture2d_array input [[texture(0)]], + texture2d_array output [[texture(1)]], + constant bilinear_interp_param & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + VECTOR(P, 4) r; + if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) { + r = input.read(gid.xy, gid.z); + } else { + P w = gid.x * pm.ratio_w; + P h = gid.y * pm.ratio_h; + uint w0 = w, h0 = h; + uint w1 = w0 + 1, h1 = h0 + 1; + P w1lambda = w - w0, h1lambda = h - h0; + P w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda; + if (w1 >= input.get_width()) w1 = w0; + if (h1 >= input.get_height()) h1 = h0; + VECTOR(P, 4) r0 = input.read(uint2(w0, h0), gid.z); + VECTOR(P, 4) r1 = input.read(uint2(w1, h0), gid.z); + VECTOR(P, 4) r2 = input.read(uint2(w0, h1), gid.z); + VECTOR(P, 4) r3 = input.read(uint2(w1, h1), gid.z); + r = h2lambda * (w2lambda * r0 + w1lambda * r1) + + h1lambda * (w2lambda * r2 + w1lambda * r3); + } + output.write(r, gid.xy, gid.z); +} + +#endif diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.metal new file mode 100644 index 0000000000000000000000000000000000000000..394cf89db09d47b0d3c87ff124c21a93962c0972 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.metal @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
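A CPU reference of the sampling math in the bilinear_interp kernel above makes the lambda weights easy to check; this sketch assumes a single-channel, row-major float image and mirrors the shader's border clamping.

```swift
// Sketch only: per-pixel bilinear sample, following bilinear_interp line for line.
// ratioW/ratioH come from the host code (inputW / outputW, inputH / outputH).
func bilinearSample(input: [[Float]], x: Int, y: Int, ratioW: Float, ratioH: Float) -> Float {
    let inH = input.count
    let inW = input[0].count
    let w = Float(x) * ratioW
    let h = Float(y) * ratioH
    let w0 = Int(w), h0 = Int(h)
    // Clamp the +1 neighbours at the border, as the kernel does.
    let w1 = min(w0 + 1, inW - 1), h1 = min(h0 + 1, inH - 1)
    let w1lambda = w - Float(w0), h1lambda = h - Float(h0)
    let w2lambda = 1 - w1lambda,  h2lambda = 1 - h1lambda
    return h2lambda * (w2lambda * input[h0][w0] + w1lambda * input[h0][w1]) +
           h1lambda * (w2lambda * input[h1][w0] + w1lambda * input[h1][w1])
}
```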
*/ + +#include +using namespace metal; + +struct bilinear_interp_param { + float ratio_h; + float ratio_w; +}; + +#define P float +#include "BilinearInterp.inc.metal" +#undef P + +#define P half +#include "BilinearInterp.inc.metal" +#undef P diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.inc.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..918fbac1a713d7b0442a1eb1f07abea3616bec96 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.inc.metal @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifdef P + +#define CONCAT2(a, b) a ## b +#define CONCAT2_(a, b) a ## _ ## b + +#define FUNC(f, p) CONCAT2_(f, p) +#define VECTOR(p, n) CONCAT2(p, n) +kernel void FUNC(boxcoder, P)(texture2d_array priorBox [[texture(0)]], + texture2d_array priorBoxVar [[texture(1)]], + texture2d_array targetBox [[texture(2)]], + texture2d_array output[[texture(3)]], + uint3 gid [[thread_position_in_grid]]) { + VECTOR(P, 4) p = priorBox.read(uint2(0, gid.x), gid.z); + VECTOR(P, 4) pv = priorBoxVar.read(uint2(0, gid.x), gid.z); + VECTOR(P, 4) t; + t[0] = targetBox.read(uint2(0, gid.x), gid.z)[0]; + t[1] = targetBox.read(uint2(1, gid.x), gid.z)[0]; + t[2] = targetBox.read(uint2(2, gid.x), gid.z)[0]; + t[3] = targetBox.read(uint2(3, gid.x), gid.z)[0]; + + P px = (p.x + p.z) / 2; + P py = (p.y + p.w) / 2; + P pw = p.z - p.x; + P ph = p.w - p.y; + + P tx = pv.x * t.x * pw + px; + P ty = pv.y * t.y * ph + py; + P tw = exp(pv.z * t.z) * pw; + P th = exp(pv.w * t.w) * ph; + + VECTOR(P, 4) r; + r.x = tx - tw / 2; + r.y = ty - th / 2; + r.z = tx + tw / 2; + r.w = ty + th / 2; + + output.write(r, gid.xy, gid.z); +} + +#endif diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal new file mode 100644 index 0000000000000000000000000000000000000000..4009e213d51d0a9c33c70aea22b015df49e347dc --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal @@ -0,0 +1,23 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
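The boxcoder kernel above implements the standard SSD-style decode: recover the prior's centre and size, apply variance-scaled offsets, exponentiate the width/height terms, and convert back to corners. Here is a scalar sketch of the same arithmetic, with illustrative names.

```swift
import Foundation  // for exp

// Sketch only: boxes are (xmin, ymin, xmax, ymax); pv is the prior-box variance
// and t the regressed offsets, exactly as the shader reads them.
func decodeBox(prior p: (Float, Float, Float, Float),
               variance pv: (Float, Float, Float, Float),
               target t: (Float, Float, Float, Float)) -> (Float, Float, Float, Float) {
    let px = (p.0 + p.2) / 2, py = (p.1 + p.3) / 2   // prior centre
    let pw = p.2 - p.0,       ph = p.3 - p.1         // prior width / height
    let tx = pv.0 * t.0 * pw + px                    // decoded centre x
    let ty = pv.1 * t.1 * ph + py                    // decoded centre y
    let tw = pw * Float(exp(Double(pv.2 * t.2)))     // decoded width
    let th = ph * Float(exp(Double(pv.3 * t.3)))     // decoded height
    return (tx - tw / 2, ty - th / 2, tx + tw / 2, ty + th / 2)
}
```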
*/ + +#include +using namespace metal; + +#define P float +#include "BoxCoder.inc.metal" +#undef P +#define P half +#include "BoxCoder.inc.metal" +#undef P diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal new file mode 100644 index 0000000000000000000000000000000000000000..40bae035c097b5ab386d78520b6b04f074eb2fee --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal @@ -0,0 +1,120 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +using namespace metal; + + +inline void xyzn2abcd_1(int xyzn[4], int abcd[4]) { + abcd[0] = abcd[1] = abcd[2] = 0; + abcd[3] = xyzn[0] * 4 + xyzn[3]; +} +inline void xyzn2abcd_2(int xyzn[4], int abcd[4]) { + abcd[0] = abcd[1] = 0; + abcd[2] = xyzn[1]; + abcd[3] = xyzn[0] * 4 + xyzn[3]; +} +inline void xyzn2abcd_3(int xyzn[4], int abcd[4]) { + abcd[0] = 0; + abcd[3] = xyzn[0]; + abcd[2] = xyzn[1]; + abcd[1] = xyzn[2] * 4 + xyzn[3]; +} +inline void xyzn2abcd_4(int C, int xyzn[4], int abcd[4]) { + abcd[2] = xyzn[0]; + abcd[1] = xyzn[1]; + uint t = xyzn[2] * 4 + xyzn[3]; + abcd[0] = t / C; + abcd[3] = t % C; +} + +inline void abcd2xyzn_1(int abcd[4], int xyzn[4]) { + xyzn[1] = xyzn[2] = 0; + xyzn[0] = abcd[3] / 4; + xyzn[1] = abcd[3] % 4; +} +inline void abcd2xyzn_2(int abcd[4], int xyzn[4]) { + xyzn[2] = 0; + xyzn[1] = abcd[2]; + xyzn[0] = abcd[3] / 4; + xyzn[3] = abcd[3] % 4; +} +inline void abcd2xyzn_3(int abcd[4], int xyzn[4]) { + xyzn[0] = abcd[3]; + xyzn[1] = abcd[2]; + xyzn[2] = abcd[1] / 4; + xyzn[3] = abcd[1] % 4; +} +inline void abcd2xyzn_4(int C, int abcd[4], int xyzn[4]) { + xyzn[0] = abcd[2]; + xyzn[1] = abcd[1]; + uint t = abcd[0] * C + abcd[3]; + xyzn[2] = t / 4; + xyzn[3] = t % 4; +} + +inline void xyzn2abcd(int C, int xyzn[4], int abcd[4]) { + abcd[2] = xyzn[0]; + abcd[1] = xyzn[1]; + uint t = xyzn[2] * 4 + xyzn[3]; + abcd[0] = t / C; + abcd[3] = t % C; +} + +inline void abcd2xyzn(int C, int abcd[4], int xyzn[4]) { + xyzn[0] = abcd[2]; + xyzn[1] = abcd[1]; + uint t = abcd[0] * C + abcd[3]; + xyzn[2] = t / 4; + xyzn[3] = t % 4; +} + +inline int32_t abcd2index(int32_t dim[4], int32_t abcd[4]) { + int32_t r = abcd[0]; + r = r * dim[1] + abcd[1]; + r = r * dim[2] + abcd[2]; + r = r * dim[3] + abcd[3]; + return r; +} + +inline void index2abcd(int32_t dim[4], int32_t ind, int32_t abcd[4]) { + abcd[3] = ind % dim[3]; ind /= dim[3]; + abcd[2] = ind % dim[2]; ind /= dim[2]; + abcd[1] = ind % dim[1]; ind /= dim[1]; + abcd[0] = ind; +} + +inline void trans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) { + for (int i = 0; i < 4; i++) { + opos[i] = ipos[trans[i]]; + } +} + +inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) { + for (int i = 0; i < 4; i++) { + opos[trans[i]] = ipos[i]; + } +} + + +struct MetalConvParam { + short offsetX; + short offsetY; + short offsetZ; + ushort strideX; + ushort strideY; + ushort dilationX; + 
ushort dilationY; +}; + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.inc.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..2b070fc48b78391e96b93823eeff7f936de2ff7d --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.inc.metal @@ -0,0 +1,318 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifdef P + +#define CONCAT2(a, b) a ## b +#define CONCAT2_(a, b) a ## _ ## b +#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c +#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d +#define CONCAT5_(a, b, c, d, e) a ## _ ## b ## _ ## c ## _ ## d ## _ ## e + +#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p) +#define VECTOR(p, n) CONCAT2(p, n) +#define FUNC_R(f, r) CONCAT2_(f, r) + +#if V == VX +#define VV x +#elif V == VY +#define VV y +#elif V == VZ +#define VV z +#else +#define VV normal +#endif + +#if V == VNORMAL +//kernel void FUNC(concat, R, N, normal, P)(array, N> in [[texture(0)]], +// texture2d_array out_x [[texture(N)]], +// texture2d_array out [[texture(N+1)]], +// constant ConcatParam & pm [[buffer(0)]], +// uint3 gid [[thread_position_in_grid]]) { +//} +kernel void FUNC(concat, R, N, VV, P)(texture2d_array in0 [[texture(0)]], + texture2d_array in1 [[texture(1)]], +#if N >= 3 + texture2d_array in2 [[texture(2)]], +#endif +#if N >= 4 + texture2d_array in3 [[texture(3)]], +#endif +#if N >= 5 + texture2d_array in4 [[texture(4)]], +#endif +#if N >= 6 + texture2d_array in5 [[texture(5)]], +#endif + texture2d_array inx [[texture(N)]], + texture2d_array out [[texture(N+1)]], + constant ConcatParam & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + ConcatParam cp = pm; + int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4]; + VECTOR(P, 4) r = inx.read(gid.xy, gid.z); + for (int i = 0; i < 4; i++) { + xyzn[3] = i; +#if R == 4 + xyzn2abcd_4(cp.odim[3], xyzn, abcd); +#else + FUNC_R(xyzn2abcd, R)(xyzn, abcd); +#endif + int k = abcd[cp.axis] - cp.offset; + if (k < 0) continue; + int j = 0; + for (; j < N; j++) { + if (k < cp.vdim[j]) { + break; + } + k -= cp.vdim[j]; + } + if (j == N) { + continue; + } + int ta = cp.odim[cp.axis]; + abcd[cp.axis] = k; + cp.odim[cp.axis] = cp.vdim[j]; +#if R == 4 + abcd2xyzn_4(cp.odim[3], abcd, oxyzn); +#else + FUNC_R(abcd2xyzn, R)(abcd, oxyzn); +#endif + cp.odim[cp.axis] = ta; + switch (j) { + case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; + case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; +#if N >= 3 + case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; +#endif +#if N >= 4 + case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; +#endif +#if N >= 5 + case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; +#endif +#if N >= 6 + case 5: r[i] = 
in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; +#endif + } + } + out.write(r, gid.xy, gid.z); +} + +#endif // V == NORMAL + + + +#if V == VX +kernel void FUNC(concat, R, N, VV, P)(texture2d_array in0 [[texture(0)]], + texture2d_array in1 [[texture(1)]], +#if N >= 3 + texture2d_array in2 [[texture(2)]], +#endif // N >= 3 +#if N >= 4 + texture2d_array in3 [[texture(3)]], +#endif // N >= 4 +#if N >= 5 + texture2d_array in4 [[texture(4)]], +#endif // N >= 5 +#if N >= 6 + texture2d_array in5 [[texture(5)]], +#endif // N >= 6 + texture2d_array out [[texture(N)]], + constant ConcatParam & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + int x = gid.x - pm.offset; + if (x < 0) return; + if (x < pm.vdim[0]) { + VECTOR(P, 4) r = in0.read(gid.xy, gid.z); + out.write(r, gid.xy, gid.z); + return; + } + x -= pm.vdim[0]; + if (x < pm.vdim[1]) { + VECTOR(P, 4) r = in1.read(uint2(x, gid.y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#if N >= 3 + x -= pm.vdim[1]; + if (x < pm.vdim[2]) { + VECTOR(P, 4) r = in2.read(uint2(x, gid.y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 3 +#if N >= 4 + x -= pm.vdim[2]; + if (x < pm.vdim[3]) { + VECTOR(P, 4) r = in3.read(uint2(x, gid.y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 4 +#if N >= 5 + x -= pm.vdim[3]; + if (x < pm.vdim[4]) { + VECTOR(P, 4) r = in4.read(uint2(x, gid.y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 5 +#if N >= 6 + x -= pm.vdim[4]; + if (x < pm.vdim[5]) { + VECTOR(P, 4) r = in5.read(uint2(x, gid.y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 6 +} +#endif // V == VX + +#if V == VY +kernel void FUNC(concat, R, N, VV, P)(texture2d_array in0 [[texture(0)]], + texture2d_array in1 [[texture(1)]], +#if N >= 3 + texture2d_array in2 [[texture(2)]], +#endif // N >= 3 +#if N >= 4 + texture2d_array in3 [[texture(3)]], +#endif // N >= 4 +#if N >= 5 + texture2d_array in4 [[texture(4)]], +#endif // N >= 5 +#if N >= 6 + texture2d_array in5 [[texture(5)]], +#endif // N >= 6 + texture2d_array out [[texture(N)]], + constant ConcatParam & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + int y = gid.y - pm.offset; + if (y < 0) return; + if (y < pm.vdim[0]) { + VECTOR(P, 4) r = in0.read(gid.xy, gid.z); + out.write(r, gid.xy, gid.z); + return; + } + y -= pm.vdim[0]; + if (y < pm.vdim[1]) { + VECTOR(P, 4) r = in1.read(uint2(gid.x, y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#if N >= 3 + y -= pm.vdim[1]; + if (y < pm.vdim[2]) { + VECTOR(P, 4) r = in2.read(uint2(gid.x, y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 3 +#if N >= 4 + y -= pm.vdim[2]; + if (y < pm.vdim[3]) { + VECTOR(P, 4) r = in3.read(uint2(gid.x, y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 4 +#if N >= 5 + y -= pm.vdim[3]; + if (y < pm.vdim[4]) { + VECTOR(P, 4) r = in4.read(uint2(gid.x, y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 5 +#if N >= 6 + y -= pm.vdim[4]; + if (y < pm.vdim[5]) { + VECTOR(P, 4) r = in5.read(uint2(gid.x, y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 6 +} +#endif // V == VY + +#if V == VZ +kernel void FUNC(concat, R, N, VV, P)(texture2d_array in0 [[texture(0)]], + texture2d_array in1 [[texture(1)]], +#if N >= 3 + texture2d_array in2 [[texture(2)]], +#endif // N >= 3 +#if N >= 4 + texture2d_array in3 [[texture(3)]], +#endif // N >= 4 +#if N >= 5 + texture2d_array in4 [[texture(4)]], +#endif // N >= 5 +#if N >= 6 
+ texture2d_array in5 [[texture(5)]], +#endif // N >= 6 + texture2d_array out [[texture(N)]], + constant ConcatParam & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + int z = gid.z - pm.offset; + if (z < 0) return; + if (z < pm.vdim[0]) { + VECTOR(P, 4) r = in0.read(gid.xy, gid.z); + out.write(r, gid.xy, gid.z); + return; + } + z -= pm.vdim[0]; + if (z < pm.vdim[1]) { + VECTOR(P, 4) r = in1.read(gid.xy, z); + out.write(r, gid.xy, gid.z); + return; + } +#if N >= 3 + z -= pm.vdim[1]; + if (z < pm.vdim[2]) { + VECTOR(P, 4) r = in2.read(gid.xy, z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 3 +#if N >= 4 + z -= pm.vdim[2]; + if (z < pm.vdim[3]) { + VECTOR(P, 4) r = in3.read(gid.xy, z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 4 +#if N >= 5 + z -= pm.vdim[3]; + if (z < pm.vdim[4]) { + VECTOR(P, 4) r = in4.read(gid.xy, z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 5 +#if N >= 6 + z -= pm.vdim[4]; + if (z < pm.vdim[5]) { + VECTOR(P, 4) r = in5.read(gid.xy, z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 6 +} +#endif // V == VZ + + +#undef VV +#endif // #ifdef P diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..b7d17f2d25de544e4ce938c577e0d04f536da9af --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.metal @@ -0,0 +1,171 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
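The x/y/z concat variants above all perform the same lookup: shift the output coordinate by the offset, then subtract vdim entries until the coordinate falls inside one of the inputs. A host-language sketch of that walk follows, with hypothetical names.

```swift
// Sketch only: which input (and local coordinate) a concat output coordinate maps to.
// vdim holds each input's extent along the concat axis, as in ConcatParam.vdim.
func concatSource(outputCoord: Int, offset: Int, vdim: [Int]) -> (input: Int, localCoord: Int)? {
    var c = outputCoord - offset
    if c < 0 { return nil }                 // before the concatenated range
    for (i, extent) in vdim.enumerated() {
        if c < extent { return (i, c) }     // read input i at the shifted coordinate
        c -= extent
    }
    return nil                              // past the concatenated range
}

// Example: inputs of extent 4, 2 and 3 along the axis; output coordinate 5
// falls into input 1 at local coordinate 1.
print(concatSource(outputCoord: 5, offset: 0, vdim: [4, 2, 3])!)  // (input: 1, localCoord: 1)
```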
*/ + +#include +#include "Common.metal" + +using namespace metal; + +struct ConcatParam { + int32_t odim[4]; + int32_t axis; + int32_t offset; + int32_t trans[4]; + int32_t vdim[6]; +}; + +#define VNORMAL 1 +#define VX 2 +#define VY 3 +#define VZ 4 + +// >> fast mode +// only support concat_{2,3,4}_{2,3,4,5,6}_y_{float,half} +// only support concat_{3,4}_{2,3,4,5,6}_x_{float,half} +// only support concat_{1,2,3,4}_{2,3,4,5,6}_z_{float,half} +// >> normal mode (loop mode) +// ssd-ar: (R=4, N=3, V=z), (R=3, N=2, V=y), (R=2, N=5, V=x), (R=3, N=5, V=x) +// ssd: (R=2, N=6, V=y), (R=3, N=6, V=y) +// genet: (R=4, N=2, V=normal) + +// ssd-ar: (R=3, N=5, V=x) +#define V VX + #define R 3 + #define N 5 + #define P float + #include "ConcatKernel.inc.metal" + #undef P + #define P half + #include "ConcatKernel.inc.metal" + #undef P + #undef N + #undef R +#undef V + +// ssd-ar: (R=2, N=5, V=x) +#define V VX + #define R 2 + #define N 5 + #define P float + #include "ConcatKernel.inc.metal" + #undef P + #define P half + #include "ConcatKernel.inc.metal" + #undef P + #undef N + #undef R +#undef V + + +// ssd-ar: (R=3, N=2, V=y) +#define V VY + #define R 3 + #define N 2 + #define P float + #include "ConcatKernel.inc.metal" + #undef P + #define P half + #include "ConcatKernel.inc.metal" + #undef P + #undef N + #undef R +#undef V + +// ssd-ar: (R=4, N=3, V=z) +#define V VZ + #define R 4 + #define N 3 + #define P float + #include "ConcatKernel.inc.metal" + #undef P + #define P half + #include "ConcatKernel.inc.metal" + #undef P + #undef N + #undef R +#undef V + + +// ssd: (R=2, N=6, V=y) +#define V VY + #define R 2 + #define N 6 + #define P float + #include "ConcatKernel.inc.metal" + #undef P + #define P half + #include "ConcatKernel.inc.metal" + #undef P + #undef N + #undef R +#undef V + +// ssd: (R=3, N=6, V=y) +#define V VY + #define R 3 + #define N 6 + #define P float + #include "ConcatKernel.inc.metal" + #undef P + #define P half + #include "ConcatKernel.inc.metal" + #undef P + #undef N + #undef R +#undef V + +#define V VNORMAL + #define R 4 + #define N 2 + #define P float + #include "ConcatKernel.inc.metal" + #undef P + #define P half + #include "ConcatKernel.inc.metal" + #undef P + #undef N + #undef R +#undef V + + +#define V VY + #define R 2 + #define N 2 + #define P float + #include "ConcatKernel.inc.metal" + #undef P + #define P half + #include "ConcatKernel.inc.metal" + #undef P + #undef N + #undef R +#undef V + + +#define V VY + #define R 2 + #define N 5 + #define P float + #include "ConcatKernel.inc.metal" + #undef P + #define P half + #include "ConcatKernel.inc.metal" + #undef P + #undef N + #undef R +#undef V + + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..87b60a64fc48ab89af274e0b24897e0b411599e0 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal @@ -0,0 +1,310 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
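The #define table above instantiates Metal entry points named concat_<R>_<N>_<axis>_<precision> (for example concat_3_2_y_float) through the FUNC macro. The host-side ConcatKernel.swift is not part of this hunk, so the helper below is purely illustrative of that naming scheme, not its actual API.

```swift
// Sketch only: assemble a specialised concat function name from rank, input count,
// fast-path axis tag and precision, matching FUNC(concat, R, N, VV, P) above.
enum ConcatAxisTag: String {
    case x, y, z, normal
}

func concatFunctionName(rank: Int, inputs: Int, axis: ConcatAxisTag, halfPrecision: Bool) -> String {
    let precision = halfPrecision ? "half" : "float"
    return "concat_\(rank)_\(inputs)_\(axis.rawValue)_\(precision)"
}

print(concatFunctionName(rank: 3, inputs: 2, axis: .y, halfPrecision: false))  // concat_3_2_y_float
```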
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "Common.metal" +using namespace metal; + + +kernel void conv_add_batch_norm_relu_1x1_half( + texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device half4 *new_scale [[buffer(3)]], + const device half4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void conv_add_batch_norm_relu_3x3_half( + texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device half4 *new_scale [[buffer(3)]], + const device half4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = 
inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void depthwise_conv_add_batch_norm_relu_3x3_half( + texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device half4 *new_scale [[buffer(3)]], + const device half4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); +} + + + +/*---------------------------------------------*/ + + + +kernel void conv_add_batch_norm_relu_1x1(texture2d_array inTexture [[texture(0)]], 
+ texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = 
weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + float4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + float4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal new file mode 100644 index 0000000000000000000000000000000000000000..274e416576743a473ba8931bcd538e9c39415f3c --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal @@ -0,0 +1,622 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
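The conv_add_batch_norm_relu kernels above read their weights from one flat buffer, indexed by output slice (the weithTo offset), output channel within the slice, kernel position, and input slice, with each element packing four input channels. A sketch of that index arithmetic, useful when laying the weights out on the host; names are illustrative.

```swift
// Sketch only: flat index of one float4/half4 weight entry, mirroring
// weights[weithTo + c * kernelHXW * input_arr_size + j * input_arr_size + i] above.
func weightIndex(outputSlice: Int, outputChannel: Int, kernelPos: Int,
                 inputSlice: Int, kernelHXW: Int, inputArraySize: Int) -> Int {
    let base = outputSlice * kernelHXW * inputArraySize * 4   // "weithTo" in the shader
    return base +
        outputChannel * kernelHXW * inputArraySize +
        kernelPos * inputArraySize +
        inputSlice
}

// Example: 3x3 kernel (kernelHXW = 9), 2 input slices, output slice 1,
// output channel 2, kernel position 4, input slice 0 -> index 116.
print(weightIndex(outputSlice: 1, outputChannel: 2, kernelPos: 4,
                  inputSlice: 0, kernelHXW: 9, inputArraySize: 2))  // 116
```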
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "Common.metal" + +using namespace metal; + +#pragma mark - convAdd +kernel void conv_add_1x1(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = biase[gid.z]; + + float4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } +// output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_add_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 9; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = biase[gid.z]; + + ushort dilation_x = param.dilationX; + ushort dilation_y = param.dilationY; + + float4 input[9]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, 
posInInput.y), i); + + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); + + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); + + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } +// output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_add_5x1(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = biase[gid.z]; + + ushort dilation_y = param.dilationY; + float4 input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i); + + for (int j = 0; j < 5; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } +// output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} + + +kernel void conv_add_1x5(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, 
param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = biase[gid.z]; + + ushort dilation_x = param.dilationX; + float4 input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i); + + for (int j = 0; j < 5; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } +// output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} + + +kernel void depthwise_conv_add_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = biase[gid.z]; + float4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + float4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w 
* weights[weithTo + 3 * kernelHXW + j]; + } +// output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} + + +#pragma mark - half + +kernel void conv_add_1x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + half4 output = biase[gid.z]; + + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } +// output = output + float4(biase[gid.z]); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_add_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + half4 output = biase[gid.z]; + + ushort dilation_x = param.dilationX; + ushort dilation_y = param.dilationY; + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); + for (int j = 0; j < 9; ++j) { + half4 
weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(float4(input[j]), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(float4(input[j]), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(float4(input[j]), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } + } +// output = output + float4(biase[gid.z]); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void depthwise_conv_add_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + half4 output = biase[gid.z]; + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } +// output = output + float4(biase[gid.z]); + outTexture.write(output, gid.xy, gid.z); +} + + +kernel void conv_add_5x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = 
gid.z * kernelHXW * input_arr_size * 4; + + half4 output = biase[gid.z]; + + ushort dilation_y = param.dilationY; + half4 input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i); + + for (int j = 0; j < 5; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } +// output = output + float4(biase[gid.z]); + outTexture.write(output, gid.xy, gid.z); +} + + +kernel void conv_add_1x5_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + half4 output = biase[gid.z]; + + ushort dilation_x = param.dilationX; + half4 input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i); + + for (int j = 0; j < 5; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } +// output = output + float4(biase[gid.z]); + outTexture.write(output, gid.xy, gid.z); +} + + +kernel void test_conv_add_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m 
[[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + if (gid.x > 0 || gid.y > 0 || gid.z > 0) { return; } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 9; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + ushort dilation_x = param.dilationX; + ushort dilation_y = param.dilationY; + + float4 input[9]; + + for (uint i = 0; i < input_arr_size; ++i) { + + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); + + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); + + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + // output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPrelu.inc.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPrelu.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..069daa20e875eb00c0d518e0463987248ca8dce5 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPrelu.inc.metal @@ -0,0 +1,447 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
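All of the conv_add_* kernels above address the weight buffer with one flattened layout: for output slice gid.z the element read is weights[weithTo + c * kernelHXW * input_arr_size + j * input_arr_size + i], where c is the output channel within the slice, j the kernel tap, and i the input slice, so the input-slice index varies fastest in memory. Below is a minimal host-side sketch of that indexing in C++; the function and parameter names are illustrative and not part of this diff.

#include <cassert>
#include <cstddef>

// Weights are laid out as [outSlice][outChannel 0..3][kernelTap][inSlice],
// one 4-wide vector (float4/half4) per entry.
std::size_t weightIndex(std::size_t outSlice, std::size_t outChannel,
                        std::size_t tap, std::size_t inSlice,
                        std::size_t kernelHXW, std::size_t inputArrSize) {
  // weithTo in the shaders: start of this output slice's weights.
  std::size_t weithTo = outSlice * kernelHXW * inputArrSize * 4;
  return weithTo + outChannel * kernelHXW * inputArrSize + tap * inputArrSize + inSlice;
}

int main() {
  // Example: 3x3 kernel, 2 input slices (8 input channels), 4 output slices.
  const std::size_t kernelHXW = 9, inSlices = 2, outSlices = 4;
  // The last entry lands exactly at the end of a buffer of
  // outSlices * 4 * kernelHXW * inSlices vectors.
  assert(weightIndex(outSlices - 1, 3, kernelHXW - 1, inSlices - 1, kernelHXW, inSlices) + 1 ==
         outSlices * 4 * kernelHXW * inSlices);
  return 0;
}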
*/ + +#ifdef P + +#include "Macro.metal" + + +#pragma mark - convAdd +kernel void FUNC3_(conv_add_1x1, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device VECTOR(P, 4) *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], +#ifdef PRELU_CHANNEL + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_ELEMENT + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_OTHER + const device P *alpha [[buffer(3)]], +#endif + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + VECTOR(P, 4) output = biase[gid.z]; + + VECTOR(P, 4) input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample,float2(posInInput.x, posInInput.y), i); + VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + +// output = output + float4(biase[gid.z]); + +#ifdef PRELU_CHANNEL + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_ELEMENT + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_OTHER + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? 
output.w : (alpha_value * output.w); +#endif + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); +} + +kernel void FUNC3_(conv_add_3x3, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device VECTOR(P, 4) *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], +#ifdef PRELU_CHANNEL + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_ELEMENT + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_OTHER + const device P *alpha [[buffer(3)]], +#endif + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 9; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + VECTOR(P, 4) output = biase[gid.z]; + + ushort dilation_x = param.dilationX; + ushort dilation_y = param.dilationY; + + VECTOR(P, 4) input[9]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); + + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); + + for (int j = 0; j < 9; ++j) { + VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } +// output = output + float4(biase[gid.z]); + +#ifdef PRELU_CHANNEL + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_ELEMENT + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? 
output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_OTHER + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? output.w : (alpha_value * output.w); +#endif + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); +} + +kernel void FUNC3_(conv_add_5x1, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device VECTOR(P, 4) *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], +#ifdef PRELU_CHANNEL + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_ELEMENT + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_OTHER + const device P *alpha [[buffer(3)]], +#endif + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + VECTOR(P, 4) output = biase[gid.z];; + + ushort dilation_y = param.dilationY; + VECTOR(P, 4) input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i); + + for (int j = 0; j < 5; ++j) { + VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + +#ifdef PRELU_CHANNEL + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_ELEMENT + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? 
output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_OTHER + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? output.w : (alpha_value * output.w); +#endif + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); +} + + +kernel void FUNC3_(conv_add_1x5, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device VECTOR(P, 4) *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], +#ifdef PRELU_CHANNEL + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_ELEMENT + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_OTHER + const device P *alpha [[buffer(3)]], +#endif + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + VECTOR(P, 4) output = biase[gid.z]; + + ushort dilation_x = param.dilationX; + VECTOR(P, 4) input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i); + + for (int j = 0; j < 5; ++j) { + VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + +#ifdef PRELU_CHANNEL + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_ELEMENT + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? 
output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_OTHER + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? output.w : (alpha_value * output.w); +#endif + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); +} + +kernel void FUNC3_(depthwise_conv_add_3x3, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device P *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], +#ifdef PRELU_CHANNEL + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_ELEMENT + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_OTHER + const device P *alpha [[buffer(3)]], +#endif + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + VECTOR(P, 4) output = biase[gid.z]; + VECTOR(P, 4) inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + VECTOR(P, 4) input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + +#ifdef PRELU_CHANNEL + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_ELEMENT + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_OTHER + P alpha_value = alpha[0]; + output.x = output.x > 0 ? 
output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? output.w : (alpha_value * output.w); +#endif + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); +} + +#endif + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPreluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPreluKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..f03a1d5b625cf01f1f1bc5ac23bebf7dabd968d9 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPreluKernel.metal @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "Common.metal" +using namespace metal; + +#define P float + + #define PRELU_CHANNEL prelu_channel + #define PRELU_TYPE prelu_channel + #include "ConvAddPrelu.inc.metal" + #undef PRELU_TYPE + #undef PRELU_CHANNEL + + #define PRELU_ELEMENT prelu_element + #define PRELU_TYPE prelu_element + #include "ConvAddPrelu.inc.metal" + #undef PRELU_TYPE + #undef PRELU_ELEMENT + + #define PRELU_OTHER prelu_other + #define PRELU_TYPE prelu_other + #include "ConvAddPrelu.inc.metal" + #undef PRELU_TYPE + #undef PRELU_OTHER + +#undef P + +#define P half + + #define PRELU_CHANNEL prelu_channel + #define PRELU_TYPE prelu_channel + #include "ConvAddPrelu.inc.metal" + #undef PRELU_TYPE + #undef PRELU_CHANNEL + + #define PRELU_ELEMENT prelu_element + #define PRELU_TYPE prelu_element + #include "ConvAddPrelu.inc.metal" + #undef PRELU_TYPE + #undef PRELU_ELEMENT + + #define PRELU_OTHER prelu_other + #define PRELU_TYPE prelu_other + #include "ConvAddPrelu.inc.metal" + #undef PRELU_TYPE + #undef PRELU_OTHER + +#undef P + + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvBNReluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvBNReluKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..4b97b7829a1fba27704fe7b60a03b2672f4f5953 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvBNReluKernel.metal @@ -0,0 +1,297 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
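ConvAddPrelu.inc.metal is included three times per precision by ConvAddPreluKernel.metal, once for each PRELU_* define, so every conv_add_* kernel gets a channel-wise, an element-wise, and a shared-alpha variant. The only difference between the variants is where alpha is read from; the following C++ sketch mirrors the three lookups. Names are illustrative, and the *4 + lane flattening of the 4-wide alpha vectors is an assumption made for the CPU-side illustration.

#include <cstddef>

enum class PreluMode { Channel, Element, Other };

// Alpha used for one lane of the output pixel at (x, y, slice); width and
// arraySize mirror outTexture.get_width() / get_array_size().
float preluAlpha(PreluMode mode, const float *alpha,
                 std::size_t x, std::size_t y, std::size_t slice,
                 std::size_t width, std::size_t arraySize, std::size_t lane) {
  switch (mode) {
    case PreluMode::Channel:  // PRELU_CHANNEL: alpha[gid.z], one vector per output slice
      return alpha[slice * 4 + lane];
    case PreluMode::Element:  // PRELU_ELEMENT: alpha[(y * W + x) * arraySize + gid.z]
      return alpha[((y * width + x) * arraySize + slice) * 4 + lane];
    case PreluMode::Other:    // PRELU_OTHER: a single shared scalar alpha[0]
      return alpha[0];
  }
  return 0.0f;
}

// The activation itself is identical in all three variants.
float prelu(float v, float a) { return v > 0 ? v : a * v; }

int main() {
  const float alpha[] = {0.25f};
  return prelu(-2.0f, preluAlpha(PreluMode::Other, alpha, 0, 0, 0, 1, 1, 0)) == -0.5f ? 0 : 1;
}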
*/ + +#include <metal_stdlib> +#include "Common.metal" + +using namespace metal; + +#pragma mark - conv bn relu +kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]], + texture2d_array<float, access::write> outTexture [[texture(1)]], + constant MetalConvParam &param [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *new_scale [[buffer(2)]], + const device float4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]], + texture2d_array<float, access::write> outTexture [[texture(1)]], + constant MetalConvParam &param [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *new_scale [[buffer(2)]], + const device float4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; +
output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float *weights [[buffer(1)]], + const device float4 *new_scale [[buffer(2)]], + const device float4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + float4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + float4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + +#pragma mark - half +kernel void conv_batch_norm_relu_1x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *new_scale [[buffer(2)]], + const device half4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = 
inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(float4(input), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(float4(input), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(float4(input), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(float4(input), float4(weight_w)); + } + output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void conv_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *new_scale [[buffer(2)]], + const device half4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(float4(input[j]), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(float4(input[j]), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(float4(input[j]), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } + } + output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void depthwise_conv_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + 
constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + const device half4 *new_scale [[buffer(2)]], + const device half4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); +} + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..c07515c13da54c7f8bf698f976e47f7cda6de32b --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal @@ -0,0 +1,280 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
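The conv_batch_norm_relu_* kernels above never see the raw batch-norm parameters; they only apply a per-channel affine transform followed by the ReLU clamp, output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0). Below is a hedged sketch of how such a folded scale/bias pair is typically derived from gamma, beta, mean, and variance on the host; the function and variable names are illustrative and not taken from this diff.

#include <cmath>
#include <cstddef>
#include <vector>

// Folds batch-norm parameters into the per-channel (new_scale, new_biase) pair
// consumed by the kernels above: y = max(conv * new_scale + new_biase, 0).
void foldBatchNorm(const std::vector<float> &gamma, const std::vector<float> &beta,
                   const std::vector<float> &mean, const std::vector<float> &variance,
                   float epsilon, std::vector<float> &newScale, std::vector<float> &newBias) {
  newScale.resize(gamma.size());
  newBias.resize(gamma.size());
  for (std::size_t c = 0; c < gamma.size(); ++c) {
    newScale[c] = gamma[c] / std::sqrt(variance[c] + epsilon);
    newBias[c] = beta[c] - mean[c] * newScale[c];
  }
}

int main() {
  std::vector<float> scale, bias;
  foldBatchNorm({1.0f}, {0.0f}, {0.5f}, {4.0f}, 1e-5f, scale, bias);
  return 0;
}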
*/ + +#include +#include "Common.metal" +using namespace metal; + +// conv +#pragma mark -- conv +kernel void conv_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + outTexture.write(output, gid.xy, gid.z); +} + +kernel void depthwise_conv_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + float4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, 
posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + float4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_1x1(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + outTexture.write(output, gid.xy, gid.z); +} + + +kernel void conv_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), 
i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(float4(input[j]), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(float4(input[j]), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(float4(input[j]), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } + } + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void depthwise_conv_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]); + output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]); + output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]); + output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]); + } + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void conv_1x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr 
sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(float4(input), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(float4(input), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(float4(input), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(float4(input), float4(weight_w)); + } + outTexture.write(half4(output), gid.xy, gid.z); +} + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..baf3f31157a472412bb08ccb3c803f5ec9e25d9c --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal @@ -0,0 +1,174 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +using namespace metal; + +struct MetalConvTransposeParam{ + ushort kernelW; + ushort kernelH; + + ushort strideX; + ushort strideY; + + ushort paddingX; + ushort paddingY; + + ushort dilationX; + ushort dilationY; +}; + +kernel void conv_transpose2x2_stride2(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvTransposeParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + int input_array_size = inTexture.get_array_size(); + int kernel_index_x = gid.x % 2; + int kernel_index_y = gid.y % 2; + int kernel_index = kernel_index_y * 2 + kernel_index_x; + int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size); + int input_x = gid.x / 2; + int input_y = gid.y / 2; + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 output = float4(0.0); + for (int i = 0; i < input_array_size; ++i) { + + float4 input = inTexture.sample(sample, float2(input_x, input_y), i); + + float4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i]; + float4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i]; + float4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i]; + float4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i]; + + output.x += dot(input, kernel_slice0); + + output.y += dot(input, kernel_slice1); + + output.z += dot(input, kernel_slice2); + + output.w += dot(input, kernel_slice3); + } + + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_transpose2x2_stride2_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvTransposeParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + int input_array_size = inTexture.get_array_size(); + int kernel_index_x = gid.x % 2; + int kernel_index_y = gid.y % 2; + int kernel_index = kernel_index_y * 2 + kernel_index_x; + int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size); + int input_x = gid.x / 2; + int input_y = gid.y / 2; + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 output = float4(0.0); + for (int i = 0; i < input_array_size; ++i) { + + half4 input = inTexture.sample(sample, float2(input_x, input_y), i); + + half4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i]; + half4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i]; + half4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i]; + half4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i]; + + output.x += dot(float4(input), float4(kernel_slice0)); + + output.y += dot(float4(input), float4(kernel_slice1)); + + output.z += dot(float4(input), float4(kernel_slice2)); + + output.w += dot(float4(input), float4(kernel_slice3)); + } + + outTexture.write(half4(output), gid.xy, gid.z); +} + +//kernel void conv_transpose(texture2d_array inTexture [[texture(0)]], +// texture2d_array outTexture [[texture(1)]], +// constant MetalConvTransposeParam ¶m [[buffer(0)]], +// const device float4 *weights [[buffer(1)]], +// uint3 gid 
[[thread_position_in_grid]]){ +// if (gid.x >= outTexture.get_width() || +// gid.y >= outTexture.get_height() || +// gid.z >= outTexture.get_array_size()) { +// return; +// } +// +// int input_array_size = inTexture.get_array_size(); +// +// uint kernel_one_output_slice = input_array_size * param.kernelW * param.kernelH; +// +// uint kernel_stride_z = gid.z * 4 * (kernel_one_output_slice); +// +// constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); +// +// float4 output; +// +// for (int w = 0; w < param.kernelW; ++w) { +// int top = gid.x - w * param.dilationX + param.paddingX; +// int input_x = top / param.strideX; +// if (top < 0 || input_x >= int(inTexture.get_width())) { +// continue; +// } +// +// for (int h = 0; h < param.kernelH; ++h) { +// int top_y = gid.y - h * param.dilationY + param.paddingY; +// int input_y = top_y / param.strideY; +// if (top_y < 0 || input_y >= int(inTexture.get_height())) { +// continue; +// } +// +// uint kernel_index = (w * param.kernelH + h) * inTexture.get_array_size(); +// +// for (int slice = 0; slice < input_array_size; ++slice) { +// +// float4 input; +// float4 kernel_slice = weights[kernel_stride_z + 0 * kernel_one_output_slice + kernel_index + slice]; +// float4 kernel_slice1 = weights[kernel_stride_z + 1 * kernel_one_output_slice + kernel_index + slice]; +// +// float4 kernel_slice2 = weights[kernel_stride_z + 2 * kernel_one_output_slice + kernel_index + slice]; +// +// float4 kernel_slice3 = weights[kernel_stride_z + 3 * kernel_one_output_slice + kernel_index + slice]; +// +// input = inTexture.sample(sample, float2(input_x, input_y), slice); +// output.x += dot(input, kernel_slice); +// output.y += dot(input, kernel_slice1); +// output.z += dot(input, kernel_slice2); +// output.w += dot(input, kernel_slice3); +// } +// } +// } +// +// outTexture.write(output, gid.xy, gid.z); +//} +// diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal new file mode 100644 index 0000000000000000000000000000000000000000..b152df828106acd96171a89f4f636f308e0e9e39 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include "Common.metal" + +using namespace metal; + +struct ElementwiseAddParam { + int32_t fast; + int32_t axis; + int32_t ylen; + int32_t xdim[4]; + int32_t xtrans[4]; + int32_t ydim[4]; + int32_t ytrans[4]; +}; + +kernel void elementwise_add(texture2d_array inputX [[texture(0)]], + texture2d_array inputY [[texture(1)]], + texture2d_array outTexture [[texture(2)]], + constant ElementwiseAddParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + float4 rx, ry; + + if (pm.fast == 1) { + rx = inputX.read(gid.xy, gid.z); + ry = inputY.read(gid.xy, gid.z); + } else { + rx = inputX.read(gid.xy, gid.z); + int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; + int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; + int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; + int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; + int32_t yshift = 4 - pm.ylen - pm.axis; + for (int n = 0; n < 4; n++) { + x_xyzn[3] = n; + xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); + invtrans(xtrans, x_abcd, t_abcd); + for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) { + y_abcd[yshift+k] = t_abcd[k]; + } + trans(ytrans, y_abcd, t_abcd); + abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); + ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; + } + } + float4 r = rx + ry; + outTexture.write(r, gid.xy, gid.z); +} + +kernel void elementwise_add_half(texture2d_array inputX [[texture(0)]], + texture2d_array inputY [[texture(1)]], + texture2d_array outTexture [[texture(2)]], + constant ElementwiseAddParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + half4 rx, ry; + + if (pm.fast == 1) { + rx = inputX.read(gid.xy, gid.z); + ry = inputY.read(gid.xy, gid.z); + } else { + rx = inputX.read(gid.xy, gid.z); + int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; + int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; + int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; + int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; + int32_t yshift = 4 - pm.ylen - pm.axis; + for (int n = 0; n < 4; n++) { + x_xyzn[3] = n; + xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); + invtrans(xtrans, x_abcd, t_abcd); + for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) { + y_abcd[yshift+k] = t_abcd[k]; + } + trans(ytrans, y_abcd, t_abcd); + abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); + ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; + } + } + half4 r = rx + ry; + outTexture.write(r, gid.xy, gid.z); +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.inc.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..b1d68d680962c53778d624ab15bfcfeb1d1a3142 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.inc.metal @@ -0,0 +1,91 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifdef P + +#include +#include "Macro.metal" + +using namespace metal; + +kernel void FUNC3_(elementwise_add, PRELU_TYPE, P)(texture2d_array inputX [[texture(0)]], + texture2d_array inputY [[texture(1)]], + texture2d_array outTexture [[texture(2)]], + constant ElementwiseAddParam &pm [[buffer(0)]], +#ifdef PRELU_CHANNEL + const device VECTOR(P, 4) *alpha [[buffer(1)]], +#endif +#ifdef PRELU_ELEMENT + const device VECTOR(P, 4) *alpha [[buffer(1)]], +#endif +#ifdef PRELU_OTHER + const device P *alpha [[buffer(1)]], +#endif + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + VECTOR(P, 4) rx, ry; + + if (pm.fast == 1) { + rx = inputX.read(gid.xy, gid.z); + ry = inputY.read(gid.xy, gid.z); + } else { + rx = inputX.read(gid.xy, gid.z); + int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; + int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; + int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; + int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; + int32_t yshift = 4 - pm.ylen - pm.axis; + for (int n = 0; n < 4; n++) { + x_xyzn[3] = n; + xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); + invtrans(xtrans, x_abcd, t_abcd); + for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) { + y_abcd[yshift+k] = t_abcd[k]; + } + trans(ytrans, y_abcd, t_abcd); + abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); + ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; + } + } + VECTOR(P, 4) output = rx + ry; + +#ifdef PRELU_CHANNEL + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_ELEMENT + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_OTHER + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? 
output.w : (alpha_value * output.w); +#endif + + outTexture.write(output, gid.xy, gid.z); +} + +#endif diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..8fd1a9fdab8c86fbc52f6dab9c448b7b0f27d403 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.metal @@ -0,0 +1,75 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "Common.metal" +using namespace metal; + +struct ElementwiseAddParam { + int32_t fast; + int32_t axis; + int32_t ylen; + int32_t xdim[4]; + int32_t xtrans[4]; + int32_t ydim[4]; + int32_t ytrans[4]; +}; + +#define P float + +#define PRELU_CHANNEL prelu_channel +#define PRELU_TYPE channel +#include "ElementwiseAddPreluKernel.inc.metal" +#undef PRELU_TYPE +#undef PRELU_CHANNEL + +#define PRELU_ELEMENT element +#define PRELU_TYPE prelu_element +#include "ElementwiseAddPreluKernel.inc.metal" +#undef PRELU_TYPE +#undef PRELU_ELEMENT + +#define PRELU_OTHER other +#define PRELU_TYPE prelu_other +#include "ElementwiseAddPreluKernel.inc.metal" +#undef PRELU_TYPE +#undef PRELU_OTHER + +#undef P + +#define P half + +#define PRELU_CHANNEL channel +#define PRELU_TYPE channel +#include "ElementwiseAddPreluKernel.inc.metal" +#undef PRELU_TYPE +#undef PRELU_CHANNEL + +#define PRELU_ELEMENT element +#define PRELU_TYPE prelu_element +#include "ElementwiseAddPreluKernel.inc.metal" +#undef PRELU_TYPE +#undef PRELU_ELEMENT + +#define PRELU_OTHER other +#define PRELU_TYPE prelu_other +#include "ElementwiseAddPreluKernel.inc.metal" +#undef PRELU_TYPE +#undef PRELU_OTHER + +#undef P + + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/FetchKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/FetchKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..b7d7028d46356e0dae21b352161de31b0820ff1a --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/FetchKernel.metal @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +using namespace metal; + +kernel void fetch(texture2d_array inTexture [[texture(0)]], + device float *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height() || + gid.z >= inTexture.get_array_size()) { + return; + } + + int input_width = inTexture.get_width(); + int input_height = inTexture.get_height(); + const float4 input = inTexture.read(gid.xy, gid.z); + int output_to = 4 * input_width * input_height; + output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x; + output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y; +// output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z; +// output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w; +} + + +kernel void fetch_half(texture2d_array inTexture [[texture(0)]], + device float * output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height() || + gid.z >= inTexture.get_array_size()) { + return; + } + + int input_width = inTexture.get_width(); + int input_height = inTexture.get_height(); + const half4 input = inTexture.read(gid.xy, gid.z); + int output_to = 4 * input_width * input_height; + output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x; + output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y; +// output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z; +// output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w; + +} + +kernel void fetch_placeholder(texture2d_array inTexture [[texture(0)]], + device float *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + +} + +kernel void fetch_placeholder_half(texture2d_array inTexture [[texture(0)]], + device float *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { +} + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal new file mode 100644 index 0000000000000000000000000000000000000000..368509f001aca6361b81b9b7839cf24b2efc5c12 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal @@ -0,0 +1,69 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include "Common.metal" +using namespace metal; + +// 占位函数, 啥也没干 +kernel void place_holder(texture2d inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + uint3 gid [[thread_position_in_grid]]) { +} + +struct OutputDim { + ushort width; + ushort height; + ushort strideX; + ushort strideY; +}; + +kernel void resize(texture2d inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant OutputDim ¶ms [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + + constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint2 pos = gid.xy * uint2(params.strideX, params.strideY); + const half4 input = inTexture.read(pos); + outTexture.write(half4(input.x, input.y, input.z, input.w), gid.xy, gid.z); +} + + +kernel void texture2d_to_2d_array(texture2d inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height()){ + return; + } + const float4 input = inTexture.read(gid.xy); + outTexture.write(input, gid.xy, 0); +} + +kernel void texture2d_to_2d_array_half(texture2d inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height()){ + return; + } + const half4 input = inTexture.read(gid.xy); + outTexture.write(input, gid.xy, 0); +} + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Macro.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Macro.metal new file mode 100644 index 0000000000000000000000000000000000000000..950d7d5f0555b841da57554ff61f2f5cdbcae7aa --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Macro.metal @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +using namespace metal; + + +#define CONCAT2(a, b) a ## b +#define CONCAT2_(a, b) a ## _ ## b +#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c +#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d +#define CONCAT5_(a, b, c, d, e) a ## _ ## b ## _ ## c ## _ ## d ## _ ## e + +#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p) +#define VECTOR(p, n) CONCAT2(p, n) + +#define FUNC3_(a, b, c) CONCAT3_(a, b, c) + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/NMSFetchResultKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/NMSFetchResultKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..44c57440e1ec138717ad1bc569fd772e0d7ede1a --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/NMSFetchResultKernel.metal @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +using namespace metal; + +kernel void nms_fetch_result(texture2d_array inTexture [[texture(0)]], + device float *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height() || + gid.z >= inTexture.get_array_size()) { + return; + } + + int input_width = inTexture.get_width(); + const float4 input = inTexture.read(gid.xy, gid.z); + output[gid.y * input_width + gid.x] = input.x; + +} + + +kernel void nms_fetch_result_half(texture2d_array inTexture [[texture(0)]], + device float *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height() || + gid.z >= inTexture.get_array_size()) { + return; + } + + int input_width = inTexture.get_width(); + const half4 input = inTexture.read(gid.xy, gid.z); + output[gid.y * input_width + gid.x] = input.x; +} + +kernel void nms_fetch_bbox(texture2d_array inTexture [[texture(0)]], + device float4 *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height() || + gid.z >= inTexture.get_array_size()) { + return; + } + + int input_width = inTexture.get_width(); +// int input_height = inTexture.get_height(); + const float4 input = inTexture.read(gid.xy, gid.z); + output[gid.y * input_width + gid.x] = input; +} + +kernel void nms_fetch_bbox_half(texture2d_array inTexture [[texture(0)]], + device float4 *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height() || + gid.z >= inTexture.get_array_size()) { + return; + } + + int input_width = inTexture.get_width(); +// int input_height = inTexture.get_height(); + const half4 input = inTexture.read(gid.xy, gid.z); + output[gid.y * input_width + gid.x] = float4(input); +} + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PoolKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PoolKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..1f2f7240db2ba716090001ed539bddb87dff5117 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PoolKernel.metal @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include "Common.metal" +using namespace metal; + +struct PoolParam { + int ksizeX; + int ksizeY; + int strideX; + int strideY; + int paddingX; + int paddingY; + int poolType; +}; + +kernel void pool(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant PoolParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + int xmin = gid.x * pm.strideX - pm.paddingX; + int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width())); + xmin = max(xmin, 0); + int ymin = gid.y * pm.strideX - pm.paddingX; + int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height())); + ymin = max(ymin, 0); + + float4 r = 0; + if (pm.poolType == 0) { + r = inTexture.read(uint2(xmin, ymin), gid.z); + for (int x = xmin; x < xmax; x++) { + for (int y = ymin; y < ymax; y++) { + r = fmax(r, inTexture.read(uint2(x, y), gid.z)); + } + } + } else if (pm.poolType == 1) { + for (int x = xmin; x < xmax; x++) { + for (int y = ymin; y < ymax; y++) { + r += inTexture.read(uint2(x, y), gid.z); + } + } + r /= pm.ksizeX * pm.ksizeY; + } + outTexture.write(r, gid.xy, gid.z); +} + +kernel void pool_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant PoolParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + int xmin = gid.x * pm.strideX - pm.paddingX; + int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width())); + xmin = max(xmin, 0); + int ymin = gid.y * pm.strideX - pm.paddingX; + int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height())); + ymin = max(ymin, 0); + + half4 r = 0; + if (pm.poolType == 0) { + r = inTexture.read(uint2(xmin, ymin), gid.z); + for (int x = xmin; x < xmax; x++) { + for (int y = ymin; y < ymax; y++) { + r = fmax(r, inTexture.read(uint2(x, y), gid.z)); + } + } + } else if (pm.poolType == 1) { + for (int x = xmin; x < xmax; x++) { + for (int y = ymin; y < ymax; y++) { + r += inTexture.read(uint2(x, y), gid.z); + } + } + r /= pm.ksizeX * pm.ksizeY; + } + outTexture.write(r, gid.xy, gid.z); +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..597804137743dd253d05d91a5008f558dcaf42e7 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +using namespace metal; + +kernel void prelu_channel(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device float4 *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + float4 alpha_value = alpha[gid.z]; + float4 output; + output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); + output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); + output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); + output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void prelu_element(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device float4 *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + + int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size(); + float4 alpha_value = alpha[alpha_to + gid.z]; + + float4 output; + output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); + output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); + output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); + output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void prelu_other(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device float *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + float alpha_value = alpha[0]; + float4 output; + output.x = input.x > 0 ? input.x : (alpha_value * input.x); + output.y = input.y > 0 ? input.y : (alpha_value * input.y); + output.z = input.z > 0 ? input.z : (alpha_value * input.z); + output.w = input.w > 0 ? input.w : (alpha_value * input.w); + outTexture.write(output, gid.xy, gid.z); +} + + +kernel void prelu_channel_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device half4 *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + half4 alpha_value = alpha[gid.z]; + half4 output; + output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); + output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); + output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); + output.w = input.w > 0 ? 
input.w : (alpha_value.w * input.w); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void prelu_element_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device half4 *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + + int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size(); + half4 alpha_value = alpha[alpha_to + gid.z]; + + half4 output; + output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); + output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); + output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); + output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void prelu_other_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device half *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + half alpha_value = alpha[0]; + half4 output; + output.x = input.x > 0 ? input.x : (alpha_value * input.x); + output.y = input.y > 0 ? input.y : (alpha_value * input.y); + output.z = input.z > 0 ? input.z : (alpha_value * input.z); + output.w = input.w > 0 ? input.w : (alpha_value * input.w); + outTexture.write(output, gid.xy, gid.z); +} + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..7630febf77210bb364f0191e8b10a5a6923d6c95 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal @@ -0,0 +1,367 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include <metal_stdlib> +using namespace metal; + +struct PriorBoxMetalParam { + float offset; + float stepWidth; + float stepHeight; + float minSize; + float maxSize; + float imageWidth; + float imageHeight; + + bool clip; + + uint numPriors; + uint aspecRatiosSize; + uint minSizeSize; + uint maxSizeSize; +}; + +kernel void prior_box(texture2d_array inTexture [[texture(0)]], + texture2d_array outBoxTexture [[texture(1)]], + texture2d_array varianceTexture [[texture(2)]], + const device float *aspect_ratios [[buffer(0)]], + constant PriorBoxMetalParam &param [[buffer(1)]], + const device float4 *variances [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outBoxTexture.get_width() || + gid.y >= outBoxTexture.get_height() || + gid.z >= outBoxTexture.get_array_size()) return; + + float center_x = (gid.x + param.offset) * param.stepWidth; + float center_y = (gid.y + param.offset) * param.stepHeight; + + float box_width, box_height; + + if (gid.z < param.aspecRatiosSize) { + float ar = aspect_ratios[gid.z]; + box_width = param.minSize * sqrt(ar) / 2; + box_height = param.minSize / sqrt(ar) / 2; + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(res, gid.xy, gid.z); + } else if (gid.z >= param.aspecRatiosSize) { + if (param.maxSizeSize > 0) { + box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; + float4 max_box; + max_box.x = (center_x - box_width) / param.imageWidth; + max_box.y = (center_y - box_height) / param.imageHeight; + max_box.z = (center_x + box_width) / param.imageWidth; + max_box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = min(max(max_box, 0.0), 1.0); + } else { + res = max_box; + } + // write the clipped max-size box, not the raw one + outBoxTexture.write(res, gid.xy, gid.z); + } + } + + float4 variance = variances[0]; + if (gid.z < param.numPriors) { + float4 variances_output; + variances_output.x = variance.x; + variances_output.y = variance.y; + variances_output.z = variance.z; + variances_output.w = variance.w; + varianceTexture.write(variances_output, gid.xy, gid.z); + } +} + + +kernel void prior_box_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outBoxTexture [[texture(1)]], + texture2d_array varianceTexture [[texture(2)]], + const device half *aspect_ratios [[buffer(0)]], + constant PriorBoxMetalParam &param [[buffer(1)]], + const device float4 *variances [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outBoxTexture.get_width() || + gid.y >= outBoxTexture.get_height() || + gid.z >= outBoxTexture.get_array_size()) return; + + float center_x = (gid.x + param.offset) * param.stepWidth; + float center_y = (gid.y + param.offset) * param.stepHeight; + + float box_width, box_height; + + if (gid.z < param.aspecRatiosSize) { + half ar = aspect_ratios[gid.z]; + box_width = param.minSize * sqrt(ar) / 2; + box_height = param.minSize / sqrt(ar) / 2; + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(half4(res), gid.xy, gid.z); + }
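// (Reference sketch, using only names already in this file: each thread's prior is centered at
// ((gid.x + offset) * stepWidth, (gid.y + offset) * stepHeight); for an aspect ratio ar it spans
// box_width = minSize * sqrt(ar) / 2 and box_height = minSize / sqrt(ar) / 2, and is stored as the
// normalized corners ((cx - bw) / imageWidth, (cy - bh) / imageHeight, (cx + bw) / imageWidth,
// (cy + bh) / imageHeight), optionally clamped to [0, 1] when clip is set.)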
else if (gid.z >= param.aspecRatiosSize) { + if (param.maxSizeSize > 0) { + box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; + float4 max_box; + max_box.x = (center_x - box_width) / param.imageWidth; + max_box.y = (center_y - box_height) / param.imageHeight; + max_box.z = (center_x + box_width) / param.imageWidth; + max_box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = min(max(max_box, 0.0), 1.0); + } else { + res = max_box; + } + outBoxTexture.write(half4(res), gid.xy, gid.z); + } + } + + float4 variance = variances[0]; + if (gid.z < param.numPriors) { + float4 variances_output; + variances_output.x = variance.x; + variances_output.y = variance.y; + variances_output.z = variance.z; + variances_output.w = variance.w; + varianceTexture.write(half4(variances_output), gid.xy, gid.z); + } +} + + + +kernel void prior_box_MinMaxAspectRatiosOrder(texture2d_array inTexture [[texture(0)]], + texture2d_array outBoxTexture [[texture(1)]], + texture2d_array varianceTexture [[texture(2)]], + const device float *aspect_ratios [[buffer(0)]], + constant PriorBoxMetalParam &param [[buffer(1)]], + const device float4 *variances [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outBoxTexture.get_width() || + gid.y >= outBoxTexture.get_height() || + gid.z >= outBoxTexture.get_array_size()) return; + + float center_x = (gid.x + param.offset) * param.stepWidth; + float center_y = (gid.y + param.offset) * param.stepHeight; + + float box_width, box_height; + + + + if (gid.z == 0) { + box_width = box_height = param.minSize / 2; + + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(res, gid.xy, gid.z); + } + + if (gid.z == 1 && param.maxSizeSize > 0) { + + box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; + float4 max_box; + max_box.x = (center_x - box_width) / param.imageWidth; + max_box.y = (center_y - box_height) / param.imageHeight; + max_box.z = (center_x + box_width) / param.imageWidth; + max_box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = min(max(max_box, 0.0), 1.0); + } else { + res = max_box; + } + outBoxTexture.write(res, gid.xy, gid.z); + } + + int aspect_to = 0; + if (param.maxSizeSize > 0) { + aspect_to = gid.z - 2; + } else { + aspect_to = gid.z - 1; + } + + + + + if (aspect_to >= 0 && aspect_to < int(param.aspecRatiosSize)) { + + int skip = 0; + for (int i = 0; i < aspect_to + 1; ++i) { + if (fabs(aspect_ratios[i] - 1.)
< 1e-6) { + skip += 1; + } + } + aspect_to += skip; + + float ar = aspect_ratios[aspect_to]; + + box_width = param.minSize * sqrt(ar) / 2; + box_height = param.minSize / sqrt(ar) / 2; + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(res, gid.xy, gid.z); + } + + float4 variance = variances[0]; + if (gid.z < param.numPriors) { + float4 variances_output; + variances_output.x = variance.x; + variances_output.y = variance.y; + variances_output.z = variance.z; + variances_output.w = variance.w; + varianceTexture.write(variances_output, gid.xy, gid.z); + } +} + + +kernel void prior_box_MinMaxAspectRatiosOrder_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outBoxTexture [[texture(1)]], + texture2d_array varianceTexture [[texture(2)]], + const device half *aspect_ratios [[buffer(0)]], + constant PriorBoxMetalParam ¶m [[buffer(1)]], + const device float4 *variances [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outBoxTexture.get_width() || + gid.y >= outBoxTexture.get_height() || + gid.z >= outBoxTexture.get_array_size()) return; + + float center_x = (gid.x + param.offset) * param.stepWidth; + float center_y = (gid.y + param.offset) * param.stepHeight; + + float box_width, box_height; + + + + if (gid.z == 0) { + box_width = box_height = param.minSize / 2; + + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(half4(res), gid.xy, gid.z); + } + + if (gid.z == 1 && param.maxSizeSize > 0) { + + box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; + float4 max_box; + max_box.x = (center_x - box_width) / param.imageWidth; + max_box.y = (center_y - box_height) / param.imageHeight; + max_box.z = (center_x + box_width) / param.imageWidth; + max_box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = min(max(max_box, 0.0), 1.0); + } else { + res = max_box; + } + outBoxTexture.write(half4(res), gid.xy, gid.z); + } + + int aspect_to = 0; + if (param.maxSizeSize > 0) { + aspect_to = gid.z - 2; + } else { + aspect_to = gid.z - 1; + } + + if (aspect_to > 0 && aspect_to < int(param.aspecRatiosSize) && fabs(aspect_ratios[aspect_to] - 1.) 
> 1e-6) { + float ar = aspect_ratios[aspect_to]; + + box_width = param.minSize * sqrt(ar) / 2; + box_height = param.minSize / sqrt(ar) / 2; + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(half4(res), gid.xy, gid.z); + } + + float4 variance = variances[0]; + if (gid.z < param.numPriors) { + float4 variances_output; + variances_output.x = variance.x; + variances_output.y = variance.y; + variances_output.z = variance.z; + variances_output.w = variance.w; + varianceTexture.write(half4(variances_output), gid.xy, gid.z); + } +} + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReluKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..e725440bbe997d571f1860bce323516144a94da8 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReluKernel.metal @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +using namespace metal; + + +kernel void relu_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero); + const half4 input = inTexture.read(gid.xy, gid.z); + const float4 relu = fmax((float4)input, 0.0); + outTexture.write(half4(relu), gid.xy, gid.z); +} + +kernel void relu(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero); + const float4 input = inTexture.read(gid.xy, gid.z); + const float4 relu = fmax((float4)input, 0.0); + outTexture.write(float4(relu), gid.xy, gid.z); +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.inc.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..7583537c2b404b7a95eeedfb4c69793a608f18ac --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.inc.metal @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifdef P + +#define CONCAT2(a, b) a ## b +#define CONCAT2_(a, b) a ## _ ## b +#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c +#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d + +#define FUNC(f, r1, r2, p) CONCAT4_(f, r1, r2, p) +#define VECTOR(p, n) CONCAT2(p, n) +#define FUNC_R(f, r) CONCAT2_(f, r) + +kernel void FUNC(reshape, RIN, ROUT, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant ReshapeParam &rp [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + + int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, oabcd[4], ixyzn[4], iabcd[4]; + ReshapeParam lrp = rp; + int oC = lrp.odim[lrp.otrans[3]]; + int iC = lrp.idim[lrp.itrans[3]]; + int count = lrp.odim[0] * lrp.odim[1] * lrp.odim[2] * lrp.odim[3]; + VECTOR(P, 4) r; + for (int n = 0; n < 4; n++) { + oxyzn[3] = n; +#if ROUT == 4 + xyzn2abcd_4(oC, oxyzn, oabcd); +#else + FUNC_R(xyzn2abcd, ROUT)(oxyzn, oabcd); +#endif + int tabcd[4]; + invtrans(lrp.otrans, oabcd, tabcd); + int index = abcd2index(lrp.odim, tabcd); + if (index < count) { + index2abcd(lrp.idim, index, tabcd); + trans(lrp.itrans, tabcd, iabcd); +#if RIN == 4 + abcd2xyzn_4(iC, iabcd, ixyzn); +#else + FUNC_R(abcd2xyzn, RIN)(iabcd, ixyzn); +#endif + r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]]; + } else { + r[n] = 0; + } + } + outTexture.write(r, gid.xy, gid.z); +} + +#endif diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..d2f5815d422ec8c4f3e1e3c1992855547e002264 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.metal @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONRITIONS OF ANY KINR, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include "Common.metal" + +using namespace metal; + +struct ReshapeParam { + int32_t idim[4]; + int32_t itrans[4]; + int32_t odim[4]; + int32_t otrans[4]; +}; + +#define P float +#define RIN 4 +#define ROUT 4 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 3 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 2 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 1 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#undef RIN + +#define RIN 3 +#define ROUT 4 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 3 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 2 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 1 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#undef RIN + +#define RIN 2 +#define ROUT 4 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 3 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 2 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 1 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#undef RIN + +#define RIN 1 +#define ROUT 4 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 3 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 2 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 1 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#undef RIN + +#undef P + +#define P half +#define RIN 4 +#define ROUT 4 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 3 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 2 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 1 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#undef RIN + +#define RIN 3 +#define ROUT 4 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 3 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 2 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 1 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#undef RIN + +#define RIN 2 +#define ROUT 4 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 3 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 2 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 1 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#undef RIN + +#define RIN 1 +#define ROUT 4 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 3 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 2 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#define ROUT 1 +#include "ReshapeKernel.inc.metal" +#undef ROUT +#undef RIN +#undef P diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ResizeBilinear.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ResizeBilinear.metal new file mode 100644 index 0000000000000000000000000000000000000000..fbb4e12cb82c12f8dc5b94c397e43b8c8c5ae518 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ResizeBilinear.metal @@ -0,0 +1,75 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +using namespace metal; + +struct resize_bilinear_param { +// int32_t out_h; +// int32_t out_w; + float ratio_h; + float ratio_w; +}; + +kernel void resize_bilinear(texture2d_array input [[texture(0)]], + texture2d_array output [[texture(2)]], + constant resize_bilinear_param & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + float4 r; + if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) { + r = input.read(gid.xy, gid.z); + } else { + float w = gid.x * pm.ratio_w; + float h = gid.y * pm.ratio_h; + uint w0 = w, h0 = h; + uint w1 = w0 + 1, h1 = h0 + 1; + float w1lambda = w - w0, h1lambda = h - h0; + float w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda; + if (w1 >= input.get_width()) w1 = w0; + if (h1 >= input.get_height()) h1 = h0; + float4 r0 = input.read(uint2(w0, h0), gid.z); + float4 r1 = input.read(uint2(w1, h0), gid.z); + float4 r2 = input.read(uint2(w0, h1), gid.z); + float4 r3 = input.read(uint2(w1, h1), gid.z); + r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3); + } + output.write(r, gid.xy, gid.z); +} + +kernel void resize_bilinear_half(texture2d_array input [[texture(0)]], + texture2d_array output [[texture(2)]], + constant resize_bilinear_param & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + half4 r; + if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) { + r = input.read(gid.xy, gid.z); + } else { + half w = gid.x * pm.ratio_w; + half h = gid.y * pm.ratio_h; + uint w0 = w, h0 = h; + uint w1 = w0 + 1, h1 = h0 + 1; + half w1lambda = w - w0, h1lambda = h - h0; + half w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda; + if (w1 >= input.get_width()) w1 = w0; + if (h1 >= input.get_height()) h1 = h0; + half4 r0 = input.read(uint2(w0, h0), gid.z); + half4 r1 = input.read(uint2(w1, h0), gid.z); + half4 r2 = input.read(uint2(w0, h1), gid.z); + half4 r3 = input.read(uint2(w1, h1), gid.z); + r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3); + } + output.write(r, gid.xy, gid.z); + output.write(r, gid.xy, gid.z); +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Shape.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Shape.metal new file mode 100644 index 0000000000000000000000000000000000000000..b50d5547193ccc9a1bef1b3ed6bbd1b7a64c3527 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Shape.metal @@ -0,0 +1,21 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +using namespace metal; + +kernel void shape() { +} +kernel void shape_half() { +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.inc.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..455cf1471b5c369fc27040e03b57812e8d6bf0e8 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.inc.metal @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifdef P + +#define CONCAT2(a, b) a ## b +#define CONCAT2_(a, b) a ## _ ## b + +#define FUNC(f, p) CONCAT2_(f, p) +#define VECTOR(p, n) CONCAT2(p, n) + +kernel void FUNC(softmax, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant SoftmaxParam &sp [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; +// int zsize = inTexture.get_array_size(); + P maxv = inTexture.read(uint2(0, gid.y), 0)[0]; + int group = sp.K / 4; + int remain = sp.K % 4; + for (int x = 0; x < group; x++) { + VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0); + maxv = max(maxv, max(r[0], max(r[1], max(r[2], r[3])))); + } + if (remain > 0) { + VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0); + for (int i = 0; i < remain; i++) { + maxv = max(maxv, r[i]); + } + } + VECTOR(P, 4) rsum = {0, 0, 0, 0}; + for (int x = 0; x < group; x++) { + VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0); + rsum += exp(r - maxv); + } + P sum = rsum[0] + rsum[1] + rsum[2] + rsum[3]; + if (remain > 0) { + VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0); + for (int i = 0; i < remain; i++) { + sum += exp(r[i] - maxv); + } + } + VECTOR(P, 4) rr = inTexture.read(gid.xy, gid.z); + rr = exp(rr - maxv) / sum; + outTexture.write(rr, gid.xy, gid.z); +} + +#endif diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.metal new file mode 100644 index 0000000000000000000000000000000000000000..67c279a4441095e710985c65d85aac589b7d0f54 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.metal @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +using namespace metal; + +struct SoftmaxParam { + int N; + int K; +}; + +#define P float +#include "Softmax.inc.metal" +#undef P + +#define P half +#include "Softmax.inc.metal" +#undef P diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.inc.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..54e3f21e793a9c1474f13fed61857211cb7d117f --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.inc.metal @@ -0,0 +1,122 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifdef P + +#define CONCAT2(a, b) a ## b +#define CONCAT2_(a, b) a ## _ ## b +#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c +#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d +#define CONCAT5_(a, b, c, d, e) a ## _ ## b ## _ ## c ## _ ## d ## _ ## e + +#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p) +#define VECTOR(p, n) CONCAT2(p, n) +#define FUNC_R(f, r) CONCAT2_(f, r) + +#if V == VX +#define VV x +#elif V == VY +#define VV y +#elif V == VZ +#define VV z +#else +#define VV normal +#endif + +#if V == VY +kernel void FUNC(split, R, N, VV, P)(texture2d_array input [[texture(0)]], + texture2d_array out1 [[texture(1)]], + texture2d_array out2 [[texture(2)]], +#if N >= 3 + texture2d_array out3 [[texture(3)]], +#endif // N >= 3 +#if N >= 4 + texture2d_array out4 [[texture(4)]], +#endif // N >= 4 + constant SplitParam &sp [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + VECTOR(P, 4) r = input.read(gid.xy, gid.z); + int y = gid.y - sp.offset; + if (y < sp.vdim[0]) { + out1.write(r, gid.xy, gid.z); + return; + } + y -= sp.vdim[0]; + if (y < sp.vdim[1]) { + out2.write(r, uint2(gid.x, y), gid.z); + return; + } +#if N >= 3 + y -= sp.vdim[1]; + if (y < sp.vdim[2]) { + out3.write(r, uint2(gid.x, y), gid.z); + return; + } +#endif // N >= 3 +#if N >= 4 + y -= sp.vdim[2]; + if (y < sp.vdim[3]) { + out4.write(r, uint2(gid.x, y), gid.z); + return; + } +#endif // N >= 4 +} +#endif // V == VY + + +#if V == VX +kernel void FUNC(split, R, N, VV, P)(texture2d_array input [[texture(0)]], + texture2d_array out1 [[texture(1)]], + texture2d_array out2 [[texture(2)]], +#if N >= 3 + texture2d_array out3 [[texture(3)]], +#endif // N >= 3 +#if N >= 4 + texture2d_array out4 [[texture(4)]], +#endif // N >= 4 + constant SplitParam &sp [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + VECTOR(P, 4) r = input.read(gid.xy, gid.z); + int x = gid.x; + if (x < sp.vdim[0]) { + out1.write(r, gid.xy, gid.z); + return; + } + x -= sp.vdim[0]; + if (x < sp.vdim[1]) { + out2.write(r, uint2(x, gid.y), gid.z); + return; + } +#if N >= 3 + x -= sp.vdim[1]; + if (x < sp.vdim[2]) { + out3.write(r, uint2(x, gid.y), gid.z); + return; + } +#endif // N >= 3 +#if N >= 4 + x -= sp.vdim[2]; + if (x < sp.vdim[3]) { + out4.write(r, uint2(x, gid.y), gid.z); + return; + } +#endif // N >= 4 +} +#endif // V == VX + + + +#undef VV +#endif 
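Note on the pattern used by the .inc.metal files above: each wrapper .metal file defines the precision macro P (and, where needed, rank macros such as RIN/ROUT or R/N/V) and then re-includes the .inc.metal body once per combination, so the token-pasting CONCAT macros stamp out one concretely named kernel per configuration. A minimal sketch of what a single expansion produces, assuming P is float and the usual access::read/access::write texture qualifiers (the template arguments on texture2d_array appear to have been stripped from this patch text):

// With P defined as float, FUNC(softmax, P) pastes to softmax_float and
// VECTOR(P, 4) pastes to float4, so the included Softmax.inc.metal body
// compiles roughly to this concrete kernel:
kernel void softmax_float(texture2d_array<float, access::read> inTexture [[texture(0)]],
                          texture2d_array<float, access::write> outTexture [[texture(1)]],
                          constant SoftmaxParam &sp [[buffer(0)]],
                          uint3 gid [[thread_position_in_grid]]) {
    // ... body as in Softmax.inc.metal, with every VECTOR(P, 4) now a float4 ...
}

The half-precision variant is produced the same way by redefining P as half before the second #include.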
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.metal new file mode 100644 index 0000000000000000000000000000000000000000..4c1e818d2bf5c7266169f406fbfaf8e322685dc4 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.metal @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "Common.metal" + +using namespace metal; + +struct SplitParam { + int32_t idim[4]; + int32_t axis; + int32_t offset; + int32_t trans[4]; + int32_t vdim[4]; +}; + +#define VNORMAL 1 +#define VX 2 +#define VY 3 +#define VZ 4 + +// only support split_{2, 3, 4}_{2, 3, 4}_y_{float, half} +// only support split_{3, 4}_{2, 3, 4}_x_{float, half} + + +//// ssd-ar: (R=3, N=2, V=y) +#define V VY + #define R 3 + #define N 2 + #define P float + #include "Split.inc.metal" + #undef P + #define P half + #include "Split.inc.metal" + #undef P + #undef N + #undef R +#undef V + + +//// ssd-ar: (R=2, N=2, V=y) +#define V VY + #define R 2 + #define N 2 + #define P float + #include "Split.inc.metal" + #undef P + #define P half + #include "Split.inc.metal" + #undef P + #undef N + #undef R +#undef V diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.inc.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..534166e45fc3db49cc5de526ec0d5179ca3f9899 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.inc.metal @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifdef P + +#define CONCAT2(a, b) a ## b +#define CONCAT2_(a, b) a ## _ ## b +#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c + +#define FUNC(f, r, p) CONCAT3_(f, r, p) +#define VECTOR(p, n) CONCAT2(p, n) + +kernel void FUNC(transpose, R, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant TransposeParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + VECTOR(P, 4) r; + int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}; + int iabcd[4], oabcd[4], ixyzn[4]; + for (int n = 0; n < 4; n++) { + oxyzn[3] = n; +#if R == 4 + xyzn2abcd_4(pm.oC, oxyzn, iabcd); +#endif // R == 4 +#if R == 3 + xyzn2abcd_3(oxyzn, oabcd); +#endif // R == 3 +#if R == 2 + xyzn2abcd_2(oxyzn, oabcd); +#endif // R == 2 + iabcd[pm.axis[0]] = oabcd[0]; + iabcd[pm.axis[1]] = oabcd[1]; + iabcd[pm.axis[2]] = oabcd[2]; + iabcd[pm.axis[3]] = oabcd[3]; +#if R == 4 + abcd2xyzn_4(pm.iC, iabcd, ixyzn); +#endif // R == 4 +#if R == 3 + abcd2xyzn_3(iabcd, ixyzn); +#endif // R == 3 +#if R == 2 + abcd2xyzn_2(iabcd, ixyzn); +#endif // R == 2 + r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]]; + } + outTexture.write(r, gid.xy, gid.z); +} + +#endif diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..321663b9b7f09eba2041cb0932215d291e44aba6 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.metal @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include "Common.metal" +using namespace metal; + +struct TransposeParam { + int iC; + int oC; + int axis[4]; +}; + +kernel void transpose_copy_float(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant TransposeParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z); +} +kernel void transpose_copy_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant TransposeParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z); +} + +#define R 4 + #define P float + #include "TransposeKernel.inc.metal" + #undef P + #define P half + #include "TransposeKernel.inc.metal" + #undef P +#undef R + +#define R 3 + #define P float + #include "TransposeKernel.inc.metal" + #undef P + #define P half + #include "TransposeKernel.inc.metal" + #undef P +#undef R + +#define R 2 + #define P float + #include "TransposeKernel.inc.metal" + #undef P + #define P half + #include "TransposeKernel.inc.metal" + #undef P +#undef R diff --git a/metal/paddle-mobile/paddle-mobile/Operators/MulticlassNMSOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/MulticlassNMSOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..fc1b3164c9cf623a1bc4d350cc8a5f72c369bae4 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/MulticlassNMSOp.swift @@ -0,0 +1,69 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class MulticlassNMSParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + scores = try MulticlassNMSParam.getFirstTensor(key: "Scores", map: opDesc.inputs, from: inScope) + bboxes = try MulticlassNMSParam.getFirstTensor(key: "BBoxes", map: opDesc.inputs, from: inScope) + output = try MulticlassNMSParam.outputOut(outputs: opDesc.outputs, from: inScope) + + middleOutput = FetchHolder.init(inCapacity: scores.tensorDim.numel(), inDim: scores.tensorDim.dims) + + bboxOutput = FetchHolder.init(inCapacity: bboxes.tensorDim.numel(), inDim: bboxes.tensorDim.dims) + } catch let error { + throw error + } + } + var bboxOutput: FetchHolder + var middleOutput: FetchHolder + let scores: Texture

+ let bboxes: Texture<P>
+ var output: Texture<P>
+} + +class MulticlassNMSOp<P: PrecisionType>: Operator<MulticlassNMSKernel<P>, MulticlassNMSParam<P>>, Runable, Creator, InferShaperable{ + + func inputVariant() -> [String : [Variant]] { + return ["Scores" : [para.middleOutput], "BBoxes" : [para.bboxOutput]] + } + + func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let _ { + fatalError() + } + } + + func inferShape() { + // para.output.dim = para.input.dim + } + + typealias OpType = MulticlassNMSOp<P>

+ func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + + } + + func delogOutput() { + print(" nms - output: ") + print(para.bboxes.metalTexture.float32Array().strideArray()) + } +} + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift index 07676defe71ec18560df4be630cd04008cd1aad6..6f42f2aa9f8d0515946ace625ed16c5040fd3099 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift @@ -15,54 +15,60 @@ import Foundation class PoolParam: OpParam { - typealias ParamPrecisionType = P - required init(opDesc: OpDesc, inScope: Scope) throws { - do { - input = try PoolParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try PoolParam.outputOut(outputs: opDesc.outputs, from: inScope) - poolType = try PoolParam.getAttr(key: "pooling_type", attrs: opDesc.attrs) - ksize = try PoolParam.getAttr(key: "ksize", attrs: opDesc.attrs) - stride = try PoolParam.getAttr(key: "strides", attrs: opDesc.attrs) - padding = try PoolParam.getAttr(key: "paddings", attrs: opDesc.attrs) - ceilMode = try PoolParam.getAttr(key: "ceil_mode", attrs: opDesc.attrs) - globalPooling = try PoolParam.getAttr(key: "global_pooling", attrs: opDesc.attrs) - } catch let error { - throw error - } -// let buffer = input.metalTexture.buffer.contents().assumingMemoryBound(to: P.self) + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try PoolParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try PoolParam.outputOut(outputs: opDesc.outputs, from: inScope) + poolType = try PoolParam.getAttr(key: "pooling_type", attrs: opDesc.attrs) + ksize = try PoolParam.getAttr(key: "ksize", attrs: opDesc.attrs) + stride = try PoolParam.getAttr(key: "strides", attrs: opDesc.attrs) + padding = try PoolParam.getAttr(key: "paddings", attrs: opDesc.attrs) + ceilMode = try PoolParam.getAttr(key: "ceil_mode", attrs: opDesc.attrs) + globalPooling = try PoolParam.getAttr(key: "global_pooling", attrs: opDesc.attrs) + assert(input.transpose == [0, 2, 3, 1]) + } catch let error { + throw error } - let input: Texture

- var output: Texture<P>
- var ksize: [Int32] - var stride: [Int32] - var padding: [Int32] - var poolType: String - var ceilMode: Bool - var globalPooling: Bool + // let buffer = input.metalTexture.buffer.contents().assumingMemoryBound(to: P.self) + } + let input: Texture<P>
+ var output: Texture<P>
+ var ksize: [Int32] + var stride: [Int32] + var padding: [Int32] + var poolType: String + var ceilMode: Bool + var globalPooling: Bool } class PoolOp<P: PrecisionType>: Operator<PoolKernel<P>, PoolParam<P>>, Runable, Creator, InferShaperable{ - - func inferShape() { - // para.output.dim = para.input.dim - } - - typealias OpType = PoolOp<P>
- func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error - } + + typealias OpType = PoolOp<P>

+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error } + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + - func delogOutput() { - print("pool2d delog") - let _: P? = para.input.metalTexture.logDesc(header: "pool2d input: ", stridable: true) - print(para.ksize) - print(para.stride) - print(para.padding) - print(para.poolType) - let _: P? = para.output.metalTexture.logDesc(header: "pool2d output: ", stridable: true) - } +// print("pool2d delog") +// let _: P? = para.input.metalTexture.logDesc(header: "pool2d input: ", stridable: true) +// print(para.ksize) +// print(para.stride) +// print(para.padding) +// print(para.poolType) +// let _: P? = para.output.metalTexture.logDesc(header: "pool2d output: ", stridable: true) + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..2d7987e937b9ddf6410ebb0d23bb89c76c1a13ce --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class PreluParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try PreluParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try PreluParam.outputOut(outputs: opDesc.outputs, from: inScope) + alpha = try PreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) + mode = try PreluParam.getAttr(key: "mode", attrs: opDesc.attrs) + } catch let error { + throw error + } + } + let mode: String + let alpha: Tensor

+ let input: Texture<P>
+ var output: Texture<P>
+} + +class PreluOp<P: PrecisionType>: Operator<PreluKernel<P>, PreluParam<P>>, Runable, Creator, InferShaperable{ + + typealias OpType = PreluOp<P>

+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) input: ") + print(para.input.metalTexture.toTensor(dim: (n: para.input.padToFourDim[0], c: para.input.padToFourDim[1], h: para.input.padToFourDim[2], w: para.input.padToFourDim[3])).strideArray()) + + print(" \(type) Alpha: ") + let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false) + + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) + } + +// print("softmax delog") +// let _: P? = para.input.metalTexture.logDesc(header: "softmax input: ", stridable: false) +// let _: P? = para.output.metalTexture.logDesc(header: "softmax output: ", stridable: false) +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..2a9f18463483a024545300661e1db33cedce585b --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift @@ -0,0 +1,124 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class PriorBoxParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + min_max_aspect_ratios_order = try PriorBoxParam.getAttr(key: "min_max_aspect_ratios_order", attrs: opDesc.attrs) + } catch _ { + } + + do { + input = try PriorBoxParam.input(inputs: opDesc.inputs, from: inScope) + output = try PriorBoxParam.outputBoxes(outputs: opDesc.outputs, from: inScope) + inputImage = try PriorBoxParam.inputImage(inputs: opDesc.inputs, from: inScope) + outputVariances = try PriorBoxParam.outputVariances(outputs: opDesc.outputs, from: inScope) + minSizes = try PriorBoxParam.getAttr(key: "min_sizes", attrs: opDesc.attrs) + maxSizes = try PriorBoxParam.getAttr(key: "max_sizes", attrs: opDesc.attrs) + aspectRatios = try PriorBoxParam.getAttr(key: "aspect_ratios", attrs: opDesc.attrs) + variances = try PriorBoxParam.getAttr(key: "variances", attrs: opDesc.attrs) + flip = try PriorBoxParam.getAttr(key: "flip", attrs: opDesc.attrs) + clip = try PriorBoxParam.getAttr(key: "clip", attrs: opDesc.attrs) + stepW = try PriorBoxParam.getAttr(key: "step_w", attrs: opDesc.attrs) + stepH = try PriorBoxParam.getAttr(key: "step_h", attrs: opDesc.attrs) + offset = try PriorBoxParam.getAttr(key: "offset", attrs: opDesc.attrs) + } catch let error { + throw error + } + } + + var min_max_aspect_ratios_order: Bool = false + let minSizes: [Float32] + let maxSizes: [Float32] + let aspectRatios: [Float32] + var newAspectRatios: MTLBuffer? 
+ let variances: [Float32] + let flip: Bool + let clip: Bool + var stepW: Float32 + var stepH: Float32 + let offset: Float32 + + let input: Texture<P>
+ let inputImage: Texture<P>
+ var output: Texture<P>
+ let outputVariances: Texture<P>
+} + +class PriorBoxOp<P: PrecisionType>: Operator<PriorBoxKernel<P>, PriorBoxParam<P>>, Runable, Creator, InferShaperable{ + + typealias OpType = PriorBoxOp<P>

+ + func inferShape() { + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + + print(" \(type) output: ") + // output +// let outputArray = para.output.metalTexture.float32Array() +// print(outputArray.strideArray()) +// let device = para.input.metalTexture!.device +// let boxes:[Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: [2,0,1,3]) +// let variances:[Float32] = device.texture2tensor(texture: para.outputVariances.metalTexture!, dim: para.outputVariances.tensorDim.dims, transpose: [2,0,1,3]) +// print("boxes: ") +// print(boxes.strideArray()) +// print("variances: ") +// print(variances.strideArray()) + // output + print(" \(type) output: ") + + let box = para.output.metalTexture.realNHWC(dim: (para.output.dim[0], para.output.dim[1], para.output.dim[2], para.output.dim[3])) + print(" dim: \(para.output.dim)") + print(box.strideArray()) +// print((0.. Float32 in +// return o +// } +// +// print(" output variance: \(outputVarianceArray)") + +// writeToLibrary(fileName: "variance_out", array: outputVarianceArray) + + } +} + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift index f65e402cdd2b6356199a2104f99556cd4fdd3b6a..7748df75fef3a2280a51dda159ead0392e146443 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift @@ -1,47 +1,58 @@ -///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. */ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + import Foundation class ReluParam: OpParam { - typealias ParamPrecisionType = P - required init(opDesc: OpDesc, inScope: Scope) throws { - do { - input = try ReluParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try ReluParam.outputOut(outputs: opDesc.outputs, from: inScope) - } catch let error { - throw error - } + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try ReluParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try ReluParam.outputOut(outputs: opDesc.outputs, from: inScope) + } catch let error { + throw error } - let input: Texture

- var output: Texture<P>
+ } + let input: Texture<P>
+ var output: Texture<P>
} class ReluOp<P: PrecisionType>: Operator<ReluKernel<P>, ReluParam<P>>, Runable, Creator, InferShaperable{ - - func inferShape() { - para.output.dim = para.input.dim - } - - typealias OpType = ReluOp<P>
- func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error - } + + typealias OpType = ReluOp<P>

+ + func inferShape() { + para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error } + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) + } + } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift index 759ffd4b8b46673e5245f8bbc67dbcc0956666aa..ac46baca91bd6eedab9241da68a05d08391ec931 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift @@ -15,36 +15,63 @@ import Foundation class ReshapeParam: OpParam { - typealias ParamPrecisionType = P - required init(opDesc: OpDesc, inScope: Scope) throws { - do { - input = try ReshapeParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try ReshapeParam.outputOut(outputs: opDesc.outputs, from: inScope) - } catch let error { - throw error + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try ReshapeParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try ReshapeParam.outputOut(outputs: opDesc.outputs, from: inScope) + shape = try ReshapeParam.getAttr(key: "shape", attrs: opDesc.attrs) + + var s: [Int] = shape.map { Int($0) } + + var di = -1 + var ml = 1 + for i in 0..= 0 { + s[di] = input.dim.numel() / ml + } + output.tensorDim = Dim.init(inDim: s) + var dim: [Int] = [1, 1, 1, 1] + for i in 0.. - var output: Texture

+ } + let input: Texture<P>
+ let shape: [Int32] + var output: Texture<P>
} class ReshapeOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>, Runable, Creator, InferShaperable{ - - func inferShape() { - // para.output.dim = para.input.dim - } - - typealias OpType = ReshapeOp<P>
- func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error - } - } - func delogOutput() { - print("reshape delog") - let _: P? = para.input.metalTexture.logDesc(header: "reshape input: ", stridable: false) - let _: P? = para.output.metalTexture.logDesc(header: "reshape output: ", stridable: false) + + typealias OpType = ReshapeOp<P>

+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error } + } + func delogOutput() { + print("reshape delog") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) +// print(outputArray) + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ResizeBilinearOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ResizeBilinearOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..e0e699cdb8b3a17eb109877f1a7bd986b5e07403 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/ResizeBilinearOp.swift @@ -0,0 +1,64 @@ +///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. */ + +import Foundation + +class ResizeBilinearParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try ResizeBilinearParam.inputX(inputs: opDesc.inputs, from: inScope) +// if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) { +// fatalError() +// } + output = try ResizeBilinearParam.outputOut(outputs: opDesc.outputs, from: inScope) + out_h = try ResizeBilinearParam.getAttr(key: "out_h", attrs: opDesc.attrs) + out_w = try ResizeBilinearParam.getAttr(key: "out_w", attrs: opDesc.attrs) + } catch let error { + throw error + } + } + let input: Texture

+ var output: Texture<P>
+ let out_h: Int32 + let out_w: Int32 +} + +class ResizeBilinearOp<P: PrecisionType>: Operator<ResizeBilinearKernel<P>, ResizeBilinearParam<P>>, Runable, Creator, InferShaperable{ + + typealias OpType = ResizeBilinearOp<P>

+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + } + +} + + + + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ShapeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ShapeOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..b37eed0a9d398923bb866444cf224cb79bb2fecc --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/ShapeOp.swift @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ShapeParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try ShapeParam.input(inputs: opDesc.inputs, from: inScope) + output = try ShapeParam.outputOut(outputs: opDesc.outputs, from: inScope) + } catch let error { + throw error + } + } + var output: Texture

+ let input: Texture<P>
+} + +class ShapeOp<P: PrecisionType>: Operator<ShapeKernel<P>, ShapeParam<P>>, Runable, Creator, InferShaperable{ + + typealias OpType = ShapeOp<P>

+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + } + +} + + + + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift index d323b21cfa7729876a78702d0098c267132b4ab1..66b5c7b3146d4c433e12b846a971e4b5ae579f79 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift @@ -15,36 +15,48 @@ import Foundation class SoftmaxParam: OpParam { - typealias ParamPrecisionType = P - required init(opDesc: OpDesc, inScope: Scope) throws { - do { - input = try SoftmaxParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try SoftmaxParam.outputOut(outputs: opDesc.outputs, from: inScope) - } catch let error { - throw error - } + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try SoftmaxParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try SoftmaxParam.outputOut(outputs: opDesc.outputs, from: inScope) + + assert(input.tensorDim.dims.count == 2) + assert(input.transpose == [0, 1, 2, 3]) + + output.dim = input.dim + output.tensorDim = input.tensorDim + output.padToFourDim = input.padToFourDim + } catch let error { + throw error } - let input: Texture

- var output: Texture<P>
+ } + let input: Texture<P>
+ var output: Texture<P>
} class SoftmaxOp<P: PrecisionType>: Operator<SoftmaxKernel<P>, SoftmaxParam<P>>, Runable, Creator, InferShaperable{ - - func inferShape() { - // para.output.dim = para.input.dim + typealias OpType = SoftmaxOp<P>
+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error } + } + + func delogOutput() { + print("softmax delog") + print(para.input) - typealias OpType = SoftmaxOp<P>

- func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error - } - } - func delogOutput() { - print("softmax delog") - let _: P? = para.input.metalTexture.logDesc(header: "softmax input: ", stridable: false) - let _: P? = para.output.metalTexture.logDesc(header: "softmax output: ", stridable: false) - } + print(para.output) + let padToFourDim = para.output.padToFourDim + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) + print(outputArray.strideArray()) + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/SplitOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/SplitOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..4495902a46426e2a866ba81a2aa761951605f940 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/SplitOp.swift @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class SplitParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try SplitParam.inputX(inputs: opDesc.inputs, from: inScope) + output = Texture

<P>.init(device: input.metalTexture!.device, inDim: input.dim) + axis = try SplitParam.getAttr(key: "axis", attrs: opDesc.attrs) + sections = try SplitParam.getAttr(key: "sections", attrs: opDesc.attrs) + if axis < 0 { + axis = input.tensorDim.cout() + axis + } + guard let outlist = opDesc.outputs["Out"] else { + fatalError() + } + for out in outlist { + guard let variant = inScope[out], let v = variant as? Texture<P> else { + fatalError() + } + outputList.append(v) + sections.append(Int32(v.tensorDim.dims[axis])) + } + } catch let error { + throw error + } + } + + var axis: Int + let input: Texture<P>
+ var output: Texture<P>
+ var outputList: [Texture<P>] = [] + var sections: [Int32] = [] +} + +class SplitOp<P: PrecisionType>: Operator<SplitKernel<P>, SplitParam<P>>, Runable, Creator, InferShaperable{ + + typealias OpType = SplitOp<P>

+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.input.metalTexture!.device + for out in para.outputList { + let arr: [Float32] = device.texture2tensor(texture: out.metalTexture, dim: out.tensorDim.dims, transpose: out.transpose) + print(arr.strideArray()) + } + } + +} + + + + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..8b695ec76fcd46b46f503e21e70f8aac52cee717 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class TransposeParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try TransposeParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try TransposeParam.outputOut(outputs: opDesc.outputs, from: inScope) + axis = try TransposeParam.getAttr(key: "axis", attrs: opDesc.attrs) + } catch let error { + throw error + } + } + let input: Texture

+ var output: Texture<P>
+ let axis: [Int32] +} + +class TransposeOp<P: PrecisionType>: Operator<TransposeKernel<P>, TransposeParam<P>>, Runable, Creator, InferShaperable{ + + typealias OpType = TransposeOp<P>

+ + func inferShape() { + //para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) + } +} + + + diff --git a/metal/paddle-mobile/paddle-mobile/PaddleMobileGPU.h b/metal/paddle-mobile/paddle-mobile/PaddleMobileGPU.h new file mode 100644 index 0000000000000000000000000000000000000000..00149053dfe6891f07f816feef524db35474a18b --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/PaddleMobileGPU.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#import +#import + +typedef enum : NSUInteger { + MobileNetType, + MobileNetSSDType, + GenetType, +} NetType; + +@interface PaddleMobileGPUResult: NSObject + +@property (assign, nonatomic) float *output; + +@property (assign, nonatomic) int outputSize; + +-(void)releaseOutput; + +@end + +@interface ModelConfig: NSObject + +/* + * 预处理需要用到的值 (三个) + */ +@property (strong, nonatomic) NSArray *means; +/* + * 预处理需要用到的 scale 值 + */ +@property (assign, nonatomic) float scale; + +/* + * 输出维度信息 [n c h w] + */ +@property (strong, nonatomic) NSArray *dims; + + +/* + * 模型参数内存地址 + */ +@property (assign, nonatomic) void *paramPointer; + +/* + * 模型参数占用内存大小 (kb) + */ +@property (assign, nonatomic) int paramSize; + +/* + * 模型内存地址 + */ +@property (assign, nonatomic) void *modelPointer; + +/* + * 模型占用内存大小 (kb) + */ +@property (assign, nonatomic) int modelSize; + +@end + +@interface PaddleMobileGPU: NSObject + +/* + * 初始化 + */ +-(instancetype)initWithCommandQueue:(id)queue net:(NetType)netType modelConfig:(ModelConfig *)config; + +/* + * paramPointer 模型参数内存地址 + * paramSize 模型参数占用内存大小 (kb) + * modelPointer 模型内存地址 + * modelSize 模型占用内存大小 (kb) + */ +-(BOOL)load; + +/* + * texture: 需要进行预测的图像转换的 texture + * completion: 预测完成回调 + */ +-(void)predict:(id)texture withCompletion:(void (^)(BOOL, NSArray *))completion; + +/* + * texture: 需要进行预测的图像转换的 texture + * completion: 预测完成回调 + */ +-(void)predict:(id)texture withResultCompletion:(void (^)(BOOL, PaddleMobileGPUResult *))completion; + +/* + * 清理内存 + */ +-(void)clear; + +@end + + diff --git a/metal/paddle-mobile/paddle-mobile/PaddleMobileGPU.m b/metal/paddle-mobile/paddle-mobile/PaddleMobileGPU.m new file mode 100644 index 0000000000000000000000000000000000000000..4e56bf2f98db9cda0d36587bef576e90b3ee6553 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/PaddleMobileGPU.m @@ -0,0 +1,95 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#import +#import "PaddleMobileGPU.h" +#import "paddle_mobile.h" +#import + +@implementation ModelConfig +@end + +@interface PaddleMobileGPUResult () + +@property (strong, nonatomic) ResultHolder *resultHolder; + +- (void)setOutputResult:(ResultHolder *)resultHolder; + +@end + +@implementation PaddleMobileGPUResult +- (void)setOutputResult:(ResultHolder *)resultHolder { + self.resultHolder = resultHolder; + self.output = resultHolder.result; + self.outputSize = resultHolder.capacity; +} + +-(void)releaseOutput { + [self.resultHolder releasePointer]; +} +@end + +@interface PaddleMobileGPU () +{ + Runner *runner; +} +@end + +@implementation PaddleMobileGPU + +-(instancetype)initWithCommandQueue:(id)queue net:(NetType)netType modelConfig:(ModelConfig *)config { + self = [super init]; + if (self) { + Net *net = nil; + if (netType == GenetType) { + net = [[Genet alloc] initWithDevice:queue.device paramPointer:config.paramPointer paramSize:config.paramSize modePointer:config.modelPointer modelSize:config.modelSize]; + } else if (netType == MobileNetSSDType) { + net = [[MobileNet_ssd_AR alloc] initWithDevice:queue.device paramPointer:config.paramPointer paramSize:config.paramSize modePointer:config.modelPointer modelSize:config.modelSize]; + } else if (netType == MobileNetType) { + + } + runner = [[Runner alloc] initInNet:net commandQueue:queue inPlatform:PlatformGPU]; + } + return self; +} + +-(BOOL)load { + return [runner load]; +} + +-(void)predict:(id)texture withCompletion:(void (^)(BOOL, NSArray *))completion { + [runner predictWithTexture:texture completion:^(BOOL success, ResultHolder * _Nullable result) { + NSMutableArray *resultArray = [NSMutableArray arrayWithCapacity:result.capacity]; + for (int i = 0; i < result.capacity; ++i) { + [resultArray addObject:[NSNumber numberWithFloat:result.result[i]]]; + } + completion(success, resultArray); + [result releasePointer]; + + }]; +} + +-(void)predict:(id)texture withResultCompletion:(void (^)(BOOL, PaddleMobileGPUResult *))completion { + [runner predictWithTexture:texture completion:^(BOOL success, ResultHolder * _Nullable result) { + PaddleMobileGPUResult *gpuResult = [[PaddleMobileGPUResult alloc] init]; + [gpuResult setOutputResult:result]; + completion(success, gpuResult); + }]; +} + +-(void)clear { + [runner clear]; +} + +@end diff --git a/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift b/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift index 8e1915a4975d5e444c2a5c0d0ee9e19d3cbe7577..98dd7ff39a71cadfe6cc33f3d468448ac5155242 100644 --- a/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift +++ b/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift @@ -14,7 +14,7 @@ import Foundation -struct BlockDesc { +class BlockDesc { let index: Int let parentIndex: Int let vars: [VarDesc] @@ -48,8 +48,10 @@ extension BlockDesc: CustomStringConvertible, CustomDebugStringConvertible { var description: String { var str = "" - for op in ops { - str += op.description + for i in 0.. 
Bool) -> [String : [String]] in - var map: [String : [String]] = [:] - for opDescVar in vars { - if (canAdd(opDescVar.parameter)) { - map[opDescVar.parameter] = opDescVar.arguments - } - } - return map - } - - inputs = creator(protoOpDesc.inputs) { - opInfos[protoOpDesc.type]?.inputs.contains($0) ?? false - } - - paraInputs = creator(protoOpDesc.inputs) { - !(opInfos[protoOpDesc.type]?.inputs.contains($0) ?? false) - } - - outputs = creator(protoOpDesc.outputs) { - opInfos[protoOpDesc.type]?.outputs.contains($0) ?? false - } - - unusedOutputs = creator(protoOpDesc.outputs) { - !(opInfos[protoOpDesc.type]?.outputs.contains($0) ?? false) - } - - for attr in protoOpDesc.attrs { - if (attr.type != .block) { - attrs[attr.name] = attrWithProtoDesc(attrDesc: attr) - } +class OpDesc { + let inputs: [String : [String]] + var paraInputs: [String : [String]] + var outputs: [String : [String]] + let unusedOutputs: [String : [String]] + var attrs: [String : Attr] = [:] + var type: String + init(protoOpDesc: PaddleMobile_Framework_Proto_OpDesc) { + type = protoOpDesc.type + let creator = { (vars: [PaddleMobile_Framework_Proto_OpDesc.Var], canAdd: (String) -> Bool) -> [String : [String]] in + var map: [String : [String]] = [:] + for opDescVar in vars { + if (canAdd(opDescVar.parameter)) { + map[opDescVar.parameter] = opDescVar.arguments } + } + return map } -} - -extension OpDesc: CustomStringConvertible, CustomDebugStringConvertible { - var description: String { - var str = "" - str += "op type: \(type): \n" - str += " op inputs: \n" - str += " \(inputs) \n" - str += " op para inputs: \n" - str += " \(paraInputs) \n" - str += " op para outputs: \n" - str += " \(outputs) \n" - str += " op attrs: \n" - str += " \(attrs) \n" - - return str + + inputs = creator(protoOpDesc.inputs) { + opInfos[protoOpDesc.type]?.inputs.contains($0) ?? false } - var debugDescription: String { - return description + paraInputs = creator(protoOpDesc.inputs) { + !(opInfos[protoOpDesc.type]?.inputs.contains($0) ?? false) } + outputs = creator(protoOpDesc.outputs) { + opInfos[protoOpDesc.type]?.outputs.contains($0) ?? false + } + + unusedOutputs = creator(protoOpDesc.outputs) { + !(opInfos[protoOpDesc.type]?.outputs.contains($0) ?? 
false) + } + + for attr in protoOpDesc.attrs { + if (attr.type != .block) { + attrs[attr.name] = attrWithProtoDesc(attrDesc: attr) + } + } + } +} + +extension OpDesc: CustomStringConvertible, CustomDebugStringConvertible { + var description: String { + var str = "" + str += "op type: \(type): \n" + str += " op inputs: \n" + str += " \(inputs) \n" + str += " op para inputs: \n" + str += " \(paraInputs) \n" + str += " op para outputs: \n" + str += " \(outputs) \n" + str += " op attrs: \n" + str += " \(attrs) \n" + return str + } + + var debugDescription: String { + return description + } + + } diff --git a/metal/paddle-mobile/paddle-mobile/Program/Program.swift b/metal/paddle-mobile/paddle-mobile/Program/Program.swift index 1481677b198f802cd5f29a967513b2df2107bc47..464705d6db2b87945029de1bfcebddb1bfb4d092 100644 --- a/metal/paddle-mobile/paddle-mobile/Program/Program.swift +++ b/metal/paddle-mobile/paddle-mobile/Program/Program.swift @@ -14,7 +14,7 @@ import Foundation -public struct Program { +public class Program { let paramPath: String let programDesc: ProgramDesc let scope: Scope @@ -23,4 +23,9 @@ public struct Program { paramPath = inParamPath scope = inScope } + init(inProgramDesc: ProgramDesc, inScope: Scope) { + programDesc = inProgramDesc + scope = inScope + paramPath = "" + } } diff --git a/metal/paddle-mobile/paddle-mobile/Program/ProgramDesc.swift b/metal/paddle-mobile/paddle-mobile/Program/ProgramDesc.swift index ef094a8a20790b4e0cf47eaea04bb7d4f7a2d046..ad472e5a7d1fe9db248e47f4417d7c61fb01eaa9 100644 --- a/metal/paddle-mobile/paddle-mobile/Program/ProgramDesc.swift +++ b/metal/paddle-mobile/paddle-mobile/Program/ProgramDesc.swift @@ -14,7 +14,7 @@ import Foundation -public struct ProgramDesc { +public class ProgramDesc { var blocks: [BlockDesc] = [] init(protoProgram: PaddleMobile_Framework_Proto_ProgramDesc) { for block in protoProgram.blocks { diff --git a/metal/paddle-mobile/paddle-mobile/Program/ProgramOptimize.swift b/metal/paddle-mobile/paddle-mobile/Program/ProgramOptimize.swift index d819cdad533e444c327e95baff7bf87e902d6bff..87aced32c0c2cd576f023eeb5a3daad15daf1ce8 100644 --- a/metal/paddle-mobile/paddle-mobile/Program/ProgramOptimize.swift +++ b/metal/paddle-mobile/paddle-mobile/Program/ProgramOptimize.swift @@ -15,204 +15,285 @@ import Foundation precedencegroup ChainNode { - associativity: left - higherThan: MultiplicationPrecedence + associativity: left + higherThan: MultiplicationPrecedence } infix operator --> : ChainNode class Node { - var inputs: [Node] = [] - var outputs: [Node] = [] - var type: String - var opDesc: OpDesc? - init(inOpDesc: OpDesc) { - type = inOpDesc.type - opDesc = inOpDesc + var inputs: [Node] = [] + var outputs: [Node] = [] + var type: String + var opDesc: OpDesc? 
+ init(inOpDesc: OpDesc) { + type = inOpDesc.type + opDesc = inOpDesc + } + + init(inType: String) { + type = inType + } + + subscript(index: Int) -> [Node] { + var nodes: [Node] = [] + getNodesWithLocation(index: index, nowIndex: 0, nodes: &nodes) + return nodes + } + + func getNodesWithLocation(index: Int, nowIndex: Int, nodes: inout [Node]) { + if index == nowIndex { + nodes.append(self) } - init(inType: String) { - type = inType + for output in outputs { + output.getNodesWithLocation(index: index, nowIndex: nowIndex + 1, nodes: &nodes) + } + } + + static func -->(lNode: Node, rNode: Node) -> Node { + lNode.outputs.append(rNode) + rNode.inputs.append(lNode) + return rNode + } + + func depth(begin: UInt = 1) -> UInt { + var beginMax: UInt = 1 + for output in outputs { + let subDepth = output.depth(begin: begin + 1) + beginMax = max(begin, subDepth) + } + beginMax = max(begin, beginMax) + return beginMax + } + + func to(depth: UInt) -> Node { + let beginNode = Node.init(inType: type) + beginNode.opDesc = opDesc + to(depth: depth - 1, withNode: beginNode) + return beginNode + } + + func folderWith(fusion: Fusion.Type, removedNodes: inout [Node]) { + let fusionNode = fusion.fusionNode() + let change = fusion.change() + let inOutputs = outputs + outputs.removeAll() + opDesc?.outputs.removeAll() + for i in 0..(lNode: Node, rNode: Node) -> Node { - lNode.outputs.append(rNode) - rNode.inputs.append(lNode) - return rNode + for attr in inOpdesc.attrs { + beginNode.opDesc?.attrs[attr.key] = attr.value + // print(beginNode.opDesc?.attrs) } - func depth(begin: UInt = 1) -> UInt { - var beginMax: UInt = 1 - for output in outputs { - let subDepth = output.depth(begin: begin + 1) - beginMax = max(begin, subDepth) + for paraInput in inOpdesc.paraInputs { + if let inChanges = change[type] { + for keyChange in inChanges { + if keyChange.from == paraInput.key { + beginNode.opDesc?.paraInputs[keyChange.to] = paraInput.value + } else { + beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value + } } - beginMax = max(begin, beginMax) - return beginMax + } else { + beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value + } } - func to(depth: UInt) -> Node { - let beginNode = Node.init(inType: type) - to(depth: depth - 1, withNode: beginNode) - return beginNode + if matchNode.outputs.count == 0 { + beginNode.outputs.append(contentsOf: outputs) + beginNode.opDesc?.outputs = inOpdesc.outputs + } + removedNodes.append(self) - func folderWith(fusion: Fusion.Type, removedNodes: inout [Node]) { - let fusionNode = fusion.fusionNode() - let change = fusion.change() - let inOutputs = outputs - outputs.removeAll() - opDesc?.outputs.removeAll() - for i in 0.. [String : Node]{ + var map: [String : Node] = [:] + relationship(map: &map) + return map + } + + private func relationship(map: inout [String : Node]) { + guard let inOpDesc = opDesc else { + return } + for output in inOpDesc.outputs { + for outputKey in output.value { + map[outputKey] = self + } + } + for output in outputs { + output.relationship(map: &map) + } + } + } extension Node: Equatable { - static func == (lhs: Node, rhs: Node) -> Bool { - if lhs.outputs.count != rhs.outputs.count { - return false - } - - if lhs.type != rhs.type { - return false - } - - for i in 0.. Bool { + if lhs.outputs.count != rhs.outputs.count { + return false } + if lhs.type != rhs.type { + return false + } + + for i in 0.. { - let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp
<P>
.self, ConvAddOp
<P>
.self] - func optimize(originProgramDesc: ProgramDesc) -> ProgramDesc { - - guard originProgramDesc.blocks.count == 1 else { - fatalError(" not support yet") + // register fusion + let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp
<P>
.self, +// ConvAddAddPreluOp
<P>
.self, + ConvAddPreluOp
<P>
.self, + ConvAddOp
<P>
.self, + ConvBNReluOp
<P>
.self, + DwConvBNReluOp
<P>
.self, + ElementwiseAddPreluOp
<P>
.self + ] + + func optimize(originProgramDesc: ProgramDesc) -> ProgramDesc { + + guard originProgramDesc.blocks.count == 1 else { + fatalError(" not support yet") + } + + var mapForNodeChain: [String : Node] = [:] + var nodes: [Node] = [] + var typeMapNodes: [String : [(node: Node, output: [String : Node])]] = [:] + let block = originProgramDesc.blocks[0] + for opDesc in block.ops { + guard let opInputKeys = opInfos[opDesc.type]?.inputs, let outputKeys = opInfos[opDesc.type]?.outputs else { + fatalError() + } + + let node = Node.init(inOpDesc: opDesc) + for inputKey in opInputKeys { + if let inputs = opDesc.inputs[inputKey] { + for input in inputs { + if let inputNode = mapForNodeChain[input] { + _ = inputNode --> node + } + } } - - var mapForNodeChain: [String : Node] = [:] - var nodes: [Node] = [] - var typeMapNodes: [String : [Node]] = [:] - let block = originProgramDesc.blocks[0] - for opDesc in block.ops { - guard let opInputKeys = opInfos[opDesc.type]?.inputs, let outputKeys = opInfos[opDesc.type]?.outputs else { - fatalError() - } - - let node = Node.init(inOpDesc: opDesc) - for inputKey in opInputKeys { - if let inputs = opDesc.inputs[inputKey] { - for input in inputs { - if let inputNode = mapForNodeChain[input] { - _ = inputNode --> node - } - } + } + + for outputKey in outputKeys { + if let outputs = opDesc.outputs[outputKey] { + for output in outputs { + mapForNodeChain[output] = node + } + } + } + + nodes.append(node) + + if var inNodes = typeMapNodes[opDesc.type] { + inNodes.append((node, mapForNodeChain)) + typeMapNodes[opDesc.type] = inNodes + } else { + typeMapNodes[opDesc.type] = [(node, mapForNodeChain)] + } + } + + for fusion in fusionOps { + let fusionNode = fusion.fusionNode() + let depth = fusionNode.depth() + if let toMatchNodes = typeMapNodes[fusionNode.type] { + for node in toMatchNodes { + + let toNode = node.node.to(depth: depth) + if toNode == fusionNode { // match + var canFolder = true + let relationshipMap = toNode.relationship() + + for toCheck in fusion.needCheck() { + // let nodes = toCheck + let checkNodes = toNode[toCheck.0] + + for checkNode in checkNodes { + let inputToChecks = checkNode.opDesc?.inputs[toCheck.1] ?? [] + for inputToCheck in inputToChecks { + if node.output[inputToCheck] == nil { + if relationshipMap[inputToCheck] == nil { + canFolder = false } + } } - for outputKey in outputKeys { - if let outputs = opDesc.outputs[outputKey] { - for output in outputs { - mapForNodeChain[output] = node - } + let paramInputToChecks = checkNode.opDesc?.paraInputs[toCheck.1] ?? [] + for paramInputToCheck in paramInputToChecks { + if node.output[paramInputToCheck] == nil { + if relationshipMap[paramInputToCheck] == nil { + canFolder = false } + } } - - nodes.append(node) - - if var inNodes = typeMapNodes[opDesc.type] { - inNodes.append(node) - typeMapNodes[opDesc.type] = inNodes - } else { - typeMapNodes[opDesc.type] = [node] - } + } } - for fusion in fusionOps { - let fusionNode = fusion.fusionNode() - let depth = fusionNode.depth() - if let toMatchNodes = typeMapNodes[fusionNode.type] { - for node in toMatchNodes { - let toNode = node.to(depth: depth) - if toNode == fusionNode { // match - var removeNodes: [Node] = [] - node.folderWith(fusion: fusion, removedNodes: &removeNodes) - for removeNode in removeNodes { - nodes.remove(element: removeNode) - } - } - } - } + if !canFolder { + continue } - - var ops: [OpDesc] = [] - for node in nodes { - ops.append(node.opDesc!) 
+ + var removeNodes: [Node] = [] + node.node.folderWith(fusion: fusion, removedNodes: &removeNodes) + for removeNode in removeNodes { + nodes.remove(element: removeNode) + } + } } - - var newProgramDesc = ProgramDesc.init() - let newBlock = BlockDesc.init(inVars: block.vars, inOps: ops) - newProgramDesc.blocks.append(newBlock) - return newProgramDesc + } } + + var ops: [OpDesc] = [] + for node in nodes { + ops.append(node.opDesc!) + } + + var newProgramDesc = ProgramDesc.init() + let newBlock = BlockDesc.init(inVars: block.vars, inOps: ops) + newProgramDesc.blocks.append(newBlock) + return newProgramDesc + } } diff --git a/metal/paddle-mobile/paddle-mobile/Program/TensorDesc.swift b/metal/paddle-mobile/paddle-mobile/Program/TensorDesc.swift index e564821ab6a68fc96f00aeb10f3b2fba26d9600e..1a72f5ef717063136c4708c881befd789a57219c 100644 --- a/metal/paddle-mobile/paddle-mobile/Program/TensorDesc.swift +++ b/metal/paddle-mobile/paddle-mobile/Program/TensorDesc.swift @@ -14,18 +14,18 @@ import Foundation -struct TensorDesc { +class TensorDesc { let dims: [Int] let dataType: VarTypeType - let dataLayout: DataLayout = .NCHW + let dataLayout: DataLayout = DataLayout.NCHW() var NCHWDim: [Int] { get { if dims.count != 4 { return dims } - if dataLayout == .NCHW { + if dataLayout == DataLayout.NCHW() { return dims - } else if dataLayout == .NHWC{ + } else if dataLayout == DataLayout.NHWC() { var resultDims = dims resultDims.swapAt(1, 3) return resultDims @@ -40,9 +40,9 @@ struct TensorDesc { if dims.count != 4 { return dims } - if dataLayout == .NHWC { + if dataLayout == DataLayout.NHWC() { return dims - } else if dataLayout == .NCHW{ + } else if dataLayout == DataLayout.NCHW() { var resultDims = dims resultDims.swapAt(1, 3) return resultDims @@ -53,7 +53,7 @@ struct TensorDesc { } init(protoTensorDesc: PaddleMobile_Framework_Proto_VarType.TensorDesc) { - dims = protoTensorDesc.dims.map{ Int($0) > 0 ? Int($0) : 1 } + dims = protoTensorDesc.dims.map{ Int($0) > 0 ? Int($0) : abs(Int($0)) } dataType = VarTypeType.init(rawValue: protoTensorDesc.dataType.rawValue) ?? 
.ErrorType } diff --git a/metal/paddle-mobile/paddle-mobile/Program/VarDesc.swift b/metal/paddle-mobile/paddle-mobile/Program/VarDesc.swift index 58411828c0c94316da089fc1e2442c87bd154594..f29169598f69ec568bd9d08af8fa4738fe8f5eea 100644 --- a/metal/paddle-mobile/paddle-mobile/Program/VarDesc.swift +++ b/metal/paddle-mobile/paddle-mobile/Program/VarDesc.swift @@ -56,7 +56,7 @@ enum VarTypeType: Int { } } -struct VarDesc { +class VarDesc { let name: String let persistable: Bool let type: VarTypeType diff --git a/metal/paddle-mobile/paddle-mobile/framework/Dim.swift b/metal/paddle-mobile/paddle-mobile/framework/Dim.swift index 672484cd9d055bbe65a61d41017199dd79d6cdb2..7e4a05a8dcfc17be10f183de36575342383bb560 100644 --- a/metal/paddle-mobile/paddle-mobile/framework/Dim.swift +++ b/metal/paddle-mobile/paddle-mobile/framework/Dim.swift @@ -31,15 +31,14 @@ public struct Dim { return dims.reduce(1) { $0 * $1 } } - static func ==(left: Dim, right: Dim) -> Bool { + public static func ==(left: Dim, right: Dim) -> Bool { return left.dims == right.dims; } - subscript(index: Int) -> Int { + public subscript(index: Int) -> Int { return dims[index]; } - private(set) var dims: [Int] private init(){ fatalError() diff --git a/metal/paddle-mobile/paddle-mobile/framework/Executor.swift b/metal/paddle-mobile/paddle-mobile/framework/Executor.swift new file mode 100644 index 0000000000000000000000000000000000000000..bdaf8d0973ad3fa6c70e04ad84fd1b14bcb8b39a --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/framework/Executor.swift @@ -0,0 +1,201 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + + +let testTo = 81 + +var isTest = false + +let computePrecision: ComputePrecision = .Float16 + +public class GPUResultHolder { + public let dim: [Int] + public let capacity: Int + public var resultPointer: UnsafeMutablePointer? + public var intermediateResults: [String : [Variant]]? + public let elapsedTime: Double + public init(inDim: [Int], inPointer: UnsafeMutablePointer?, inCapacity: Int, inElapsedTime: Double, inIntermediateResults: [String : [Variant]]? 
= nil) { + dim = inDim + capacity = inCapacity + + if let inInPointer = inPointer { + resultPointer = UnsafeMutablePointer.allocate(capacity: inCapacity) + resultPointer?.initialize(from: inInPointer, count: inCapacity) + } + + elapsedTime = inElapsedTime + intermediateResults = inIntermediateResults + } + +} + +extension GPUResultHolder: CustomDebugStringConvertible, CustomStringConvertible { + public var debugDescription: String { +// var str = "" +// str += "Dim: \(dim) \n value:[ " +// if resultArr.count < 20 { +// for d in resultArr { +// str += " \(d) " +// } +// } else { +// for d in stride(from: 0, to: resultArr.count, by: resultArr.count/20) { +// str += " \(resultArr[d]) " +// } +// } +// str += " ]" +// return str + fatalError() + } + + public var description: String { + return debugDescription + } +} + +public class Executor { + var ops: [Runable & InferShaperable] = [] + let program: Program + let device: MTLDevice + let inflightSemaphore: DispatchSemaphore + let queue: MTLCommandQueue + public init(inDevice:MTLDevice, inQueue: MTLCommandQueue, inProgram: Program) throws { + self.inflightSemaphore = DispatchSemaphore(value: 3) + program = inProgram + device = inDevice + queue = inQueue +// print("before for ") +//print(program.scope.vars["fea_pyramid1_mbox_conf_flat.Flatten.output.1.tmp_0"]) + + + for block in inProgram.programDesc.blocks { + //block.ops.count + for i in 0...shared.creat(device: inDevice, opDesc: opDesc, scope: inProgram.scope) + ops.append(op) + } catch let error { + throw error + } + } + } + } + + public func predict(input: MTLTexture, dim: [Int], completionHandle: @escaping (GPUResultHolder) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws { + guard let buffer = queue.makeCommandBuffer() else { + throw PaddleMobileError.predictError(message: "CommandBuffer is nil") + } + inflightSemaphore.wait() + + let resInput: MTLTexture + if let inPre = preProcessKernle { + do { + try inPre.compute(inputTexuture: input, commandBuffer: buffer) + resInput = inPre.outputTexture + } catch let error { + throw error + } + } else { + resInput = input + } + + let beforeDate = Date.init() + let inputTexture = InputTexture.init(inMTLTexture: resInput, inExpectDim: Dim.init(inDim: dim)) + program.scope.setInput(input: inputTexture) + //(ops.count - except) + for i in 0..<(ops.count - except) { + let op = ops[i] + do { + try op.run(device: device, buffer: buffer) + } catch let error { + throw error + } + } + + var outputTextures: [String : [Variant]]? + if except > 0 { + ops[ops.count - except].computeMiddleResult(device: device, buffer: buffer) + outputTextures = ops[ops.count - except].inputVariant() + } + + buffer.addCompletedHandler { [weak self] (commandbuffer) in +// let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2])) +// print(inputArr.strideArray()) +// +//// print(dim) +// writeToLibrary(fileName: "test_image_ssd_ar", array: inputArr) +// print(" write done ") + +// print("write to library done") +// return +// print(inputArr) +// +// let stridableInput: [(index: Int, value: Float)] = input.stridableFloatArray() +// print(stridableInput) +// +// let _: Flo? = input.logDesc(header: "input: ", stridable: true) +// for i in 0.. 0 { + resultHolder = GPUResultHolder.init(inDim: [], inPointer: nil, inCapacity: 0, inElapsedTime: afterDate.timeIntervalSince(beforeDate), inIntermediateResults: outputTextures) + } else { + let outputVar: Variant = SSelf.program.scope.output()! + let output: FetchHolder = outputVar as! 
FetchHolder +// let beforeToTensorDate = Date.init() + + resultHolder = GPUResultHolder.init(inDim: output.dim, inPointer: output.result, inCapacity: output.capacity, inElapsedTime: afterDate.timeIntervalSince(beforeDate)) + +// let timeToTensor = Date.init().timeIntervalSince(beforeToTensorDate) +// print(timeToTensor) + } + + completionHandle(resultHolder) + SSelf.inflightSemaphore.signal() + } + buffer.commit() + } + + public func clear() { + program.scope.clear() + } + +} diff --git a/metal/paddle-mobile/paddle-mobile/framework/Loader.swift b/metal/paddle-mobile/paddle-mobile/framework/Loader.swift new file mode 100644 index 0000000000000000000000000000000000000000..ee640ddf1163bb1f41da49fe9089964321792d9f --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/framework/Loader.swift @@ -0,0 +1,259 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation +import SwiftProtobuf + +public class Loader { + class ParaLoader { + let file: UnsafeMutablePointer + let fileSize: Int + var nowIndex: Int + init(paramPath: String) throws { + guard let tmpFile = fopen(paramPath, "rb") else { + throw PaddleMobileError.loaderError(message: "open param file error" + paramPath) + } + file = tmpFile + fseek(file, 0, SEEK_END) + fileSize = ftell(file) + guard fileSize > 0 else { + throw PaddleMobileError.loaderError(message: "param file size is too small") + } + rewind(file) + nowIndex = 0 + } + + func read(tensor: Tensor
<P>
) throws { + guard nowIndex <= fileSize else { + throw PaddleMobileError.loaderError(message: "out of the file range") + } + + func pointerReader(type: T.Type) -> T { + let ptr = UnsafeMutablePointer.allocate(capacity: MemoryLayout.size) + fread(ptr, 1, MemoryLayout.size, file) + nowIndex += MemoryLayout.size + let pointee = ptr.pointee + ptr.deinitialize(count: MemoryLayout.size) + ptr.deallocate() + return pointee + } + + let _ = pointerReader(type: UInt32.self) + let lodLevel = pointerReader(type: UInt64.self) + for _ in 0...size)){ + _ = pointerReader(type: size_t.self) + } + } + + let _ = pointerReader(type: UInt32.self) + + let tensorDescSize = pointerReader(type: Int32.self) + + fseek(file, Int(tensorDescSize), SEEK_CUR) + nowIndex += Int(tensorDescSize) + + /* + 这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度 + */ + + //现在模型传入模型为 Float 类型, 这块应该根据模型来 + // let tmpCapacity = MemoryLayout.size * tensor.numel() + // let tmpPointer = UnsafeMutablePointer.allocate(capacity: tmpCapacity); + let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file) + + guard bytesRead == tensor.data.size else { + throw PaddleMobileError.loaderError(message: "param read size error") + } + + // TODO: use script to convert + // let bytesRead = fread(tmpPointer, 1, tmpCapacity, file) + // for i in 0..) throws { + guard nowIndex <= paramSize else { + throw PaddleMobileError.loaderError(message: "out of the file range") + } + var readerIndex: Int = 0 + func pointerReader(type: T.Type) -> T { + let ptr = UnsafeMutablePointer.allocate(capacity: MemoryLayout.size) + memcpy(ptr, paramPointer.advanced(by: Int(readerIndex)), MemoryLayout.size) + nowIndex += MemoryLayout.size + readerIndex += MemoryLayout.size + let pointee = ptr.pointee + ptr.deinitialize(count: MemoryLayout.size) + ptr.deallocate() + + return pointee + } + let _ = pointerReader(type: UInt32.self) + let lodLevel = pointerReader(type: UInt64.self) + for _ in 0...size)){ + _ = pointerReader(type: size_t.self) + } + } + + let _ = pointerReader(type: UInt32.self) + let tensorDescSize = pointerReader(type: Int32.self) + + paramPointer = paramPointer.advanced(by: Int(readerIndex)) + paramPointer = paramPointer.advanced(by: Int(tensorDescSize)) + nowIndex += Int(tensorDescSize) + + let _ = memcpy(tensor.data.pointer, paramPointer, tensor.data.size) + paramPointer = paramPointer.advanced(by: Int(tensor.data.size)) + nowIndex += tensor.data.size + } + deinit { + } + } + public init(){} + func loadModelandParam(_ device:MTLDevice,_ modelData:Data, _ paraLoaderPointer:ParaLoaderWithPointer?, _ paraLoader:ParaLoader?) throws -> Program { + do { + let protoProgram = try PaddleMobile_Framework_Proto_ProgramDesc.init( + serializedData: modelData) + + let originProgramDesc = ProgramDesc.init(protoProgram: protoProgram) + let programDesc = ProgramOptimize
<P>
.init().optimize(originProgramDesc: originProgramDesc) + print(programDesc) + + guard programDesc.blocks.count > 0 else { + throw PaddleMobileError.loaderError(message: "count of blocks must greater than 0") + } + + // to get feed key and fetch key + let block = programDesc.blocks[0] + guard let firstOp = block.ops.first, let lastOp = block.ops.last else { + throw PaddleMobileError.loaderError(message: "at least two operator") + } + + guard firstOp.type == gFeedType, lastOp.type == gFetchType else { + throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch") + } + + guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else { + throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found") + } + guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else { + throw PaddleMobileError.loaderError(message: "feed key or fetch key not found") + } + + let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey) + + // to load memory + for block in programDesc.blocks { + for varDesc in block.vars { + if (varDesc.type == .LodTensor) { + guard let tensorDesc = varDesc.tensorDesc else { + throw PaddleMobileError.loaderError(message: "get tensor desc failed") + } + + if (varDesc.persistable + && varDesc.type != .FeedMiniBatch + && varDesc.type != .FetchList) { + let dimArr = tensorDesc.dims + + guard dimArr.count > 0 else { + throw PaddleMobileError.loaderError(message: "tensor desc dim size error") + } + + let dim = Dim.init(inDim: dimArr) + let tensor = Tensor
<P>
.init(inDim: dim, inLayout: tensorDesc.dataLayout) + do { + if paraLoaderPointer != nil { + try paraLoaderPointer!.read(tensor: tensor) + } + + if paraLoader != nil { + try paraLoader!.read(tensor: tensor) + } + } catch let error { + throw error + } + // tensor.convert(to: DataLayout.NHWC()) + // tensor.initBuffer(device: device) + scope[varDesc.name] = tensor + } else { + let dim = Dim.init(inDim: tensorDesc.dims) + scope[varDesc.name] = Texture
<P>
.init(device: device, inDim: dim) + } + } else { + if varDesc.name == fetchKey { +// scope[varDesc.name] = ResultHolder.init(inDim: [], inResult: [], inCapacity: <#Int#>, inElapsedTime: 0.0) + } else if varDesc.name == feedKey { + } + } + } + } + + let program = Program.init(inProgramDesc: programDesc, inScope: scope) + + return program + } catch _ { + throw PaddleMobileError.loaderError(message: "protobuf decoder error") + } + } + public func load(device:MTLDevice, paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) throws -> Program { + let modelData = Data.init(bytes:modePointer, count:modelSize) + guard let paraLoader = try? ParaLoaderWithPointer.init(pPointer: paramPointer,pSize: paramSize) else { + throw PaddleMobileError.loaderError(message: "load para error") + } + do { + let program = try loadModelandParam(device,modelData,paraLoader,nil) + return program + } catch let error { + throw error + } + } + + public func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program{ + guard let modelData = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else { + throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !") + } + guard let paraLoader = try? ParaLoader.init(paramPath: paraPath) else { + throw PaddleMobileError.loaderError(message: "load para error") + } + + do { + let program = try loadModelandParam(device,modelData,nil,paraLoader) + return program + } catch let error { + throw error + } + } +} diff --git a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift index 7ffcd97f4418f17cd7085c5d03e8b58b45c623fd..c5ee1414521e7eb92011d4f4b608ad326b005531 100644 --- a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift +++ b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift @@ -12,251 +12,308 @@ See the License for the specific language governing permissions and limitations under the License. */ -import Accelerate import Foundation protocol Tensorial: CustomStringConvertible, CustomDebugStringConvertible{ - var dim: Dim { get set } - func numel() -> Int - var layout: DataLayout { get } + var dim: Dim { get set } + func numel() -> Int + var layout: DataLayout { get } } extension Tensorial { - func numel() -> Int { - return dim.numel() - } + func numel() -> Int { + return dim.numel() + } +} + +public enum ComputePrecision { + case Float32, Float16 } class Tensor: Tensorial { - enum BufferPrecision { - case Float32, Float16 + + var data: Data + var dim: Dim + var buffer: MTLBuffer! + private(set) var layout: DataLayout + + class Data { + init(inSize: Int, inPointer: UnsafeMutablePointer
<P>
) { + size = inSize + pointer = inPointer + } + let size: Int + var pointer: UnsafeMutablePointer
<P>
+ subscript(index: Int) -> P{ + get { + return pointer[index] + } + set { + pointer[index] = newValue + } + } + func release() { + pointer.deinitialize(count: size) + pointer.deallocate() + } + deinit { + // release() + } + } + + required init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) { + dim = inDim + let size = inDim.numel() * MemoryLayout
<P>
.size + let pointer = UnsafeMutablePointer
<P>
.allocate(capacity: size) + data = Data.init(inSize: size, inPointer: pointer) + layout = inLayout + } + + func convert(to: DataLayout) { + guard to != layout else { + return } - var data: Data - var dim: Dim - var buffer: MTLBuffer! - private(set) var layout: DataLayout + guard dim.cout() == 4 else { + return + } - class Data { - init(inSize: Int, inPointer: UnsafeMutablePointer
<P>
) { - size = inSize - pointer = inPointer - } - let size: Int - var pointer: UnsafeMutablePointer
<P>
- subscript(index: Int) -> P{ - get { - return pointer[index] - } - set { - pointer[index] = newValue - } - } - func release() { - pointer.deinitialize(count: size) - pointer.deallocate() - } - deinit { -// release() - } + guard layout == DataLayout.NCHW() && to == DataLayout.NHWC() else { + // other not support + return } - - required init(inDim: Dim, inLayout: DataLayout = .NCHW) { - dim = inDim - let size = inDim.numel() * MemoryLayout
<P>
.size - let pointer = UnsafeMutablePointer
<P>
.allocate(capacity: size) - data = Data.init(inSize: size, inPointer: pointer) - layout = inLayout + let newPointer = UnsafeMutablePointer
<P>
.allocate(capacity: data.size) + + if layout == DataLayout.NCHW() { + NCHW2NHWC(newPtr: newPointer) } - func convert(to: DataLayout) { - guard to != layout else { - return - } - - guard dim.cout() == 4 else { - return - } - - guard layout == .NCHW && to == .NHWC else { - // other not support - return - } - let newPointer = UnsafeMutablePointer
<P>
.allocate(capacity: data.size) - - if layout == .NCHW { - NCHW2NHWC(newPtr: newPointer) - } - - data.release() - data.pointer = newPointer - layout = to + data.release() + data.pointer = newPointer + layout = to + } + + + + func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16, convertToNHWC: Bool = true, withTranspose: Bool = false) { + if convertToNHWC { +// print(layout) + convert(to: DataLayout.NHWC()) } - func float32ToFloat16(input: UnsafeMutablePointer, output: UnsafeMutableRawPointer, count: Int) { - var float32Buffer = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4) - var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2) - guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else { - fatalError(" float 32 to float 16 error ! ") + if withTranspose { + let transposePointer = UnsafeMutablePointer
<P>
.allocate(capacity: numel()) + let n = dim[0] + let hwc = numel()/n + for j in 0.. else { - fatalError(" not support yet ") - } - - - let precisionSize: Int - switch precision { - case .Float32: - precisionSize = 4 - case .Float16: - precisionSize = 2 - } - - if dim.cout() == 4 { - if layout == .NHWC { - let C = dim[3] - let cSlices = (C + 3) / 4 - let paddedC = cSlices * 4 - let count = paddedC * dim[0] * dim[1] * dim[2] - if C == paddedC { - buffer = device.makeBuffer(length: count * precisionSize) - switch precision { - case .Float32: - buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout
<P>
.stride) - case .Float16: - float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count) - } - } else if C == 1 { - buffer = device.makeBuffer(length: numel() * precisionSize) - switch precision { - case .Float32: - buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout
<P>
.stride) - case .Float16: - float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel()) - } - } else { - buffer = device.makeBuffer(length: count * precisionSize) - let convertedPointer = UnsafeMutablePointer.allocate(capacity: count) - var tmpPointer = floatPointer - var dstPtr = convertedPointer - for _ in 0...stride) - case .Float16: - float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count) - } - - convertedPointer.deinitialize(count: count) - convertedPointer.deallocate() - } - } - } else if dim.cout() == 1 { - buffer = device.makeBuffer(length: numel() * precisionSize) - switch precision { - case .Float32: - buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout
<P>
.stride) - case .Float16: - float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel()) - } - } else { - fatalError(" not support !") - } - //TODO: release - data.release() + guard let floatPointer = data.pointer as? UnsafeMutablePointer else { + fatalError(" not support yet ") } - var width: Int { - get { - if dim.cout() == 4 { - return dim[1] - } else { - fatalError() - } - } + let precisionSize: Int + switch precision { + case .Float32: + precisionSize = 4 + case .Float16: + precisionSize = 2 } - var height: Int { - get { - if dim.cout() == 4 { - return dim[2] - } else { - fatalError() + if dim.cout() == 4 { + if layout == DataLayout.NHWC() { + let C = dim[3] + let cSlices = (C + 3) / 4 + let paddedC = cSlices * 4 + let count = paddedC * dim[0] * dim[1] * dim[2] + if C == paddedC { + buffer = device.makeBuffer(length: count * precisionSize) + switch precision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout
<P>
.stride) + case .Float16: + float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count) + } + } else if C == 1 { + buffer = device.makeBuffer(length: numel() * precisionSize) + switch precision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout
<P>
.stride) + case .Float16: + float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel()) + } + } else { + buffer = device.makeBuffer(length: count * precisionSize) + let convertedPointer = UnsafeMutablePointer.allocate(capacity: count) + var tmpPointer = floatPointer + var dstPtr = convertedPointer + for _ in 0...stride) + case .Float16: + float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count) + } + + convertedPointer.deinitialize(count: count) + convertedPointer.deallocate() } - } - - var channel: Int { - get { - if dim.cout() == 4 { - return dim[3] - } else { - fatalError() + } else { + let C = dim[3] + let cSlices = (C + 3) / 4 + let paddedC = cSlices * 4 + let count = paddedC * dim[0] * dim[1] * dim[2] + if C == paddedC { + buffer = device.makeBuffer(length: count * precisionSize) + switch precision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout
<P>
.stride) + case .Float16: + float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count) + } + } else if C == 1 { + fatalError(" not support ") + } else { + buffer = device.makeBuffer(length: count * precisionSize) + let convertedPointer = UnsafeMutablePointer.allocate(capacity: count) + var tmpPointer = floatPointer + var dstPtr = convertedPointer + for _ in 0...stride) + case .Float16: + float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count) + } + convertedPointer.deinitialize(count: count) + convertedPointer.deallocate() } + } + } else if dim.cout() == 1 { + let num = ((numel() + 3) / 4) * 4 + buffer = device.makeBuffer(length: num * precisionSize) + switch precision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: num * MemoryLayout
<P>
.stride) + case .Float16: + float32ToFloat16(input: floatPointer, output: buffer.contents(), count: num) + } + } else { + fatalError(" not support !") } - + //TODO: release + data.release() + } + + var width: Int { + get { + if dim.cout() == 4 { + return dim[1] + } else { + fatalError() + } + } + } + + var height: Int { + get { + if dim.cout() == 4 { + return dim[2] + } else { + fatalError() + } + } + } + + var channel: Int { + get { + if dim.cout() == 4 { + return dim[3] + } else { + fatalError() + } + } + } + + + func NCHW2NHWC(newPtr: UnsafeMutablePointer
<P>
) { + let N = dim[0] + let C = dim[1] + let H = dim[2] + let W = dim[3] + let HXW = H * W + let CXHXW = C * H * W - func NCHW2NHWC(newPtr: UnsafeMutablePointer
<P>
) { - let N = dim[0] - let C = dim[1] - let H = dim[2] - let W = dim[3] - let HXW = H * W - let CXHXW = C * H * W - - var index: Int = 0 - for n in 0...size { - str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])" - } - return str + + var debugDescription: String { + var str = "dim: \(dim) \n" + str += "MTLBuffer: \(self.buffer) \n" + for i in 0...size { + str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])" } - - func logDataPointer(header: String = "") { - print(header) - var str = "" - str += "data size: \(data.size) \n" - str += "dim: \(dim) \n" - for i in 0..: Tensorial { - var dim: Dim - let textureDesc: MTLTextureDescriptor - var metalTexture: MTLTexture - - init(device: MTLDevice, inDim: Dim, inLayout: DataLayout = .NHWC) { - dim = inDim - layout = inLayout - let tmpTextureDes = MTLTextureDescriptor.init() - if inDim.cout() == 1 { - tmpTextureDes.width = inDim[0] - tmpTextureDes.textureType = .type1D - } else if inDim.cout() == 4 { - tmpTextureDes.height = inDim[1] - tmpTextureDes.width = inDim[2] -// print("n : \(inDim[0])") -// print(inDim[3] * inDim[0]) - tmpTextureDes.depth = 1 - tmpTextureDes.arrayLength = (inDim[3] * inDim[0] + 3)/4 - tmpTextureDes.textureType = .type2DArray - } else if inDim.cout() == 2 { - tmpTextureDes.height = 1 - tmpTextureDes.width = 1 - tmpTextureDes.depth = 1 - tmpTextureDes.arrayLength = (inDim[0] * inDim[1] + 3)/4 - tmpTextureDes.textureType = .type2DArray - } else { - fatalError(" not suuprt ") - } - - if MemoryLayout
<P>
.size == 1 { - tmpTextureDes.pixelFormat = .rgba8Unorm - } else if MemoryLayout
<P>
.size == 2 { - tmpTextureDes.pixelFormat = .rgba16Float - } else if MemoryLayout
<P>
.size == 4 { -// tmpTextureDes.pixelFormat = .r32Float - tmpTextureDes.pixelFormat = .rgba32Float - } -// tmpTextureDes.pixelFormat = .rgba16Float +/* + 4 维 tensor 存储 texture,要考虑 transpose + transpose 之后的维度是 [a, b, c, d],对应的texture_2darray + .width = c + .height = b + .len = a * d + 3 / 4 + +低于 4 维的 tensor,transpose 必须为 [0, 1, 2, 3] 既不考虑 transpose + +// TODO transpose 对于低维 tensor 的扩展原则。。。 +// [a, b] -> [1, 1, a, b] transpose 必须为 [0, 1, x, x] +// [a] -> [1, 1, 1, a] transpose 必须为 [0, 1, 2, 3] +// [a, b, c] -> [1, a, b, c] tranpose 必须为 [0, x, x, x] + +3 维 tensor [a, b, c] 对应的 texture_2darray, +.width = c +.height = b +.len = a + 3 / 4 + + 2 维 tensor [a, b] 对应的 texture_2darray + .width = b + 3 / 4 + .height = a + .len = 1 + + 1 维 tensor [a] 对应的 texture_2darray + .width = a + 3 / 4 + .height = 1 + .len = 1 + */ + - tmpTextureDes.usage = [.shaderRead, .shaderWrite] - tmpTextureDes.storageMode = .shared - textureDesc = tmpTextureDes - metalTexture = device.makeTexture(descriptor: tmpTextureDes) ?! " texture nil " +public class Texture: Tensorial { + var dim: Dim + public var tensorDim: Dim + public var padToFourDim: Dim + private var textureDesc: MTLTextureDescriptor! + public var metalTexture: MTLTexture! + var transpose: [Int] = [0, 1, 2, 3] + + func toTensor() -> [Float32] { + guard padToFourDim.cout() == 4 else { + fatalError("- not support -") } + return metalTexture.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2])) + } + + func realNHWC() -> [Float32] { + guard padToFourDim.cout() == 4 else { + fatalError(" - not support - ") + } + return metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) + } + + func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) { + transpose = inTranspose + for i in 0..<(4 - tensorDim.cout()) { + if i != inTranspose[i] { + fatalError() + } + } + let newDim = transpose.map { padToFourDim[$0] } -// required public init(inDim: Dim, inLayout: DataLayout = .NHWC, inTexture: MTLTexture) { -// dim = inDim -// layout = inLayout -// metalTexture = inTexture -// let tmpTextureDes = MTLTextureDescriptor.init() -// -// if inDim.cout() == 1 { -// tmpTextureDes.width = inDim[0] -// tmpTextureDes.textureType = .type1D -// } else if inDim.cout() == 2 { -// tmpTextureDes.height = inDim[0] -// tmpTextureDes.width = inDim[1] -// tmpTextureDes.textureType = .type2D -// } else if inDim.cout() == 3 { -// fatalError(" not support texture dim 3") -// } else if inDim.cout() == 4 { -// tmpTextureDes.height = inDim[1] -// tmpTextureDes.width = inDim[2] -// tmpTextureDes.depth = inDim[3] * inDim[1] -// tmpTextureDes.textureType = .type2DArray -// } -// -// tmpTextureDes.pixelFormat = .r32Float -// tmpTextureDes.storageMode = .shared -// textureDesc = tmpTextureDes -// let device = MTLCreateSystemDefaultDevice() -// metalTexture = device!.makeTexture(descriptor: tmpTextureDes)! -// } + let newLayout = transpose.map { layout.layoutWithDim[$0] } -// init() { -// dim = Dim.init(inDim: []) -// layout = .NCHW -// let device = MTLCreateSystemDefaultDevice() -// textureDesc = MTLTextureDescriptor.init() -// metalTexture = device!.makeTexture(descriptor: textureDesc)! 
-// } + layout = DataLayout.init(newLayout) + dim = Dim.init(inDim: newDim) - private(set) var layout: DataLayout -} - -extension Texture { - public var description: String { - return debugDescription - } + let tmpTextureDes = MTLTextureDescriptor.init() + tmpTextureDes.textureType = .type2DArray + tmpTextureDes.depth = 1 - public var debugDescription: String{ - var str = "" - str += "Dim: \(dim) \n value:[ " - str += "\(metalTexture)" - str += " ]" - return str + switch tensorDim.cout() { + case 4: + tmpTextureDes.width = newDim[2] + tmpTextureDes.height = newDim[1] + tmpTextureDes.arrayLength = ((newDim[0]) * (newDim[3]) + 3) / 4 + case 3: + tmpTextureDes.width = newDim[3] + tmpTextureDes.height = newDim[2] + tmpTextureDes.arrayLength = (newDim[1] + 3) / 4 + case 2, 1: + tmpTextureDes.width = (newDim[3] + 3) / 4 + tmpTextureDes.height = newDim[2] + tmpTextureDes.arrayLength = 1 + default: + fatalError("unreachable") + } + + if computePrecision == .Float16 { + tmpTextureDes.pixelFormat = .rgba16Float + } else if computePrecision == .Float32 { + tmpTextureDes.pixelFormat = .rgba32Float } + tmpTextureDes.usage = [.shaderRead, .shaderWrite] + tmpTextureDes.storageMode = .shared + textureDesc = tmpTextureDes + metalTexture = device.makeTexture(descriptor: tmpTextureDes) ?! " texture nil " + } + + init(device: MTLDevice, inDim: Dim) { + var fourDim: Dim + if inDim.cout() == 4 { + fourDim = inDim + } else if inDim.cout() < 4 { + var fourDimNum: [Int] = [] + for _ in 0..<(4 - inDim.cout()) { + fourDimNum.append(1) + } + fourDimNum.append(contentsOf: inDim.dims) + fourDim = Dim.init(inDim: fourDimNum) + } else { + fatalError(" not support ") + } + tensorDim = inDim + dim = fourDim + padToFourDim = fourDim + layout = DataLayout.init([(.N, fourDim[0]), (.C, fourDim[1]), (.H, fourDim[2]), (.W, fourDim[3])]) + } + + private(set) var layout: DataLayout +} + +extension Texture { + public var description: String { + return debugDescription + } + + public var debugDescription: String{ + var str = "" + str += "Dim: \(dim) \n value:[ " + str += "\(metalTexture)" + str += " ]" + return str + } + } diff --git a/metal/paddle-mobile/paddle-mobile/paddle_mobile.h b/metal/paddle-mobile/paddle-mobile/paddle_mobile.h index ffa44be38a4c3a1f3109c51b3d15506591f2de2e..50b60e9fe6c973b675a97e16c3c15af2b72e3fc4 100644 --- a/metal/paddle-mobile/paddle-mobile/paddle_mobile.h +++ b/metal/paddle-mobile/paddle-mobile/paddle_mobile.h @@ -14,12 +14,15 @@ #pragma once +#import "PaddleMobileCPU.h" +#import "CPUCompute.h" +#import "PaddleMobileGPU.h" #import //! Project version number for paddle_mobile. -FOUNDATION_EXPORT double paddle_mobileVersionNumber; +//FOUNDATION_EXPORT double paddle_mobileVersionNumber; //! Project version string for paddle_mobile. 
-FOUNDATION_EXPORT const unsigned char paddle_mobileVersionString[]; +//FOUNDATION_EXPORT const unsigned char paddle_mobileVersionString[]; diff --git a/python/tools/imagetools/imagetools.py b/python/tools/imagetools/imagetools.py new file mode 100644 index 0000000000000000000000000000000000000000..2a4432858007d6858f2728815670cfd1ed5ec786 --- /dev/null +++ b/python/tools/imagetools/imagetools.py @@ -0,0 +1,61 @@ +# coding=utf-8 +import cv2 +from array import array + + +def resize_take_rgbs(path, shape_h_w): + print '--------------resize_take_rgbs-----------------begin' + image = cv2.imread(path) + # print image.shape + cv2.imshow("before", image) + + print_rgb(image[0, 0]) + # image len may be for .just check it + # image.resize(shape_h_w) + + image = cv2.resize(image, (shape_h_w[0], shape_h_w[1])) + + cv2.imshow("after", image) + print image.shape + height = shape_h_w[0] + width = shape_h_w[1] + + rs_ = [] + gs_ = [] + bs_ = [] + for h in range(0, height): + for w in range(0, width): + bs_.append(image[h, w, 0]) + gs_.append(image[h, w, 1]) + rs_.append(image[h, w, 2]) + + # print image[2, 2, 0]/255. + print len(bs_) + print len(gs_) + print len(rs_) + print '--------------resize_take_rgbs-----------------end' + return bs_, gs_, rs_ + + +def print_rgb((b, g, r)): + print "像素 - R:%d,G:%d,B:%d" % (r, g, b) # 显示像素值 + # + # image[0, 0] = (100, 150, 200) # 更改位置(0,0)处的像素 + # + # (b, g, r) = image[0, 0] # 再次读取(0,0)像素 + # print "位置(0,0)处的像素 - 红:%d,绿:%d,蓝:%d" % (r, g, b) # 显示更改后的像素值 + # + # corner = image[0:100, 0:100] # 读取像素块 + # cv2.imshow("Corner", corner) # 显示读取的像素块 + # + # image[0:100, 0:100] = (0, 255, 0); # 更改读取的像素块 + # + # cv2.imshow("Updated", image) # 显示图像 + # + # cv2.waitKey(0) # 程序暂停 + + +def save_to_file(to_file_name, array): + to_file = open(to_file_name, "wb") + array.tofile(to_file) + to_file.close() diff --git a/python/tools/imagetools/img2nchw.py b/python/tools/imagetools/img2nchw.py new file mode 100644 index 0000000000000000000000000000000000000000..70ca456a1b1b5d20b92d0aaa51b01abb352c1d54 --- /dev/null +++ b/python/tools/imagetools/img2nchw.py @@ -0,0 +1,69 @@ +# coding=utf-8 +import cv2 +from array import array +import imagetools as tools +from enum import Enum + + +class ChannelType(Enum): + RGB = 0, + BGR = 1 + + +def combine_bgrs_nchw(bgrs, means_b_g_r, scale, channel_type=ChannelType.BGR): + print '--------------combine_bgrs_nchw-----------------begin' + print "scale: %f" % scale + print means_b_g_r + # print len(bgrs) + bs = bgrs[0] + gs = bgrs[1] + rs = bgrs[2] + + assert len(bs) == len(gs) == len(rs) + print len(bs) + bgrs_float_array = array('f') + + if channel_type == ChannelType.BGR: + print 'bgr' + for i in range(0, len(bs)): + bgrs_float_array.append((bs[i] - means_b_g_r[0]) * scale) # b + for i in range(0, len(gs)): + bgrs_float_array.append((gs[i] - means_b_g_r[1]) * scale) # g + for i in range(0, len(rs)): + bgrs_float_array.append((rs[i] - means_b_g_r[2]) * scale) # r + elif channel_type == ChannelType.RGB: + print 'rgb' + + for i in range(0, len(rs)): + bgrs_float_array.append((rs[i] - means_b_g_r[2]) * scale) # r + for i in range(0, len(gs)): + bgrs_float_array.append((gs[i] - means_b_g_r[1]) * scale) # g + for i in range(0, len(bs)): + bgrs_float_array.append((bs[i] - means_b_g_r[0]) * scale) # b + + print len(bgrs_float_array) + + print '------------------' + print bgrs_float_array[0] + print bgrs_float_array[416 * 416 * 2 + 416 * 2 + 2] + + # for i in range(0, 9): + # print'bs %d' % i + # print bs[i] / 255. + + print bs[416 * 2 + 2] / 255. 
+ print '--------------combine_bgrs_nchw-----------------end' + + return bgrs_float_array + + +# bgrs = tools.resize_take_rgbs('banana.jpeg', (224, 224, 3)) +# array = combine_bgrs_nchw(bgrs, (103.94, 116.78, 123.68), 0.017, array,ChannelType.BGR) +# tools.save_to_file('banana_1_3_224_224_nchw_float') + +# cv2.waitKey(0) + + +bgrs = tools.resize_take_rgbs('datas/newyolo.jpg', (416, 416, 3)) +array = combine_bgrs_nchw(bgrs, (0, 0, 0), 1. / 255, ChannelType.RGB) +tools.save_to_file('datas/desktop_1_3_416_416_nchw_float', array) diff --git a/python/tools/imagetools/img2nhwc.py b/python/tools/imagetools/img2nhwc.py new file mode 100644 index 0000000000000000000000000000000000000000..c982fe303ecde08a9de1827ca67024567322d47f --- /dev/null +++ b/python/tools/imagetools/img2nhwc.py @@ -0,0 +1,34 @@ +# coding=utf-8 +import cv2 +from array import array +import imagetools as tools + + +def combine_bgrs_nhwc(bgrs, means_b_g_r, scale): + print "scale: %f" % scale + print means_b_g_r + # print len(bgrs) + bs = bgrs[0] + gs = bgrs[1] + rs = bgrs[2] + assert len(bs) == len(gs) == len(rs) + # print len(bs) + bgrs_float_array = array('f') + for i in range(0, len(bs)): + bgrs_float_array.append((rs[i] - means_b_g_r[2]) * scale) # r + bgrs_float_array.append((gs[i] - means_b_g_r[1]) * scale) # g + bgrs_float_array.append((bs[i] - means_b_g_r[0]) * scale) # b + + print len(bgrs_float_array) + + print '------------------' + print bgrs_float_array[0] + print bgrs_float_array[999] + return bgrs_float_array + + +bgrs = tools.resize_take_rgbs('newyolo_1.jpg', (416, 416, 3)) +array = combine_bgrs_nhwc(bgrs, (0, 0, 0), 1.0 / 255) +tools.save_to_file('desktop_1_3_416_416_nhwc_float', array) + +cv2.waitKey(0) diff --git a/python/tools/imagetools/numpy2binary.py b/python/tools/imagetools/numpy2binary.py new file mode 100644 index 0000000000000000000000000000000000000000..dd4bc6e10074183b8dcee4122860c4140ff54229 --- /dev/null +++ b/python/tools/imagetools/numpy2binary.py @@ -0,0 +1,47 @@ +# coding=utf-8 + +# 这个脚本是可以将numpy合并到二进制 +import cv2 +import numpy as np +import imagetools as tools +from array import array + +# +# image = cv2.imread(path) +# print image.shape +# +# print_rgb(image[0, 0]) +# # image len may be for .just check it +# image.resize(shape_h_w) + + +data = np.fromfile('datas/img.res') +print data.size +print data[0] + +data.reshape(1, 3, 416, 416) +out_array = array('f') +print'--------------------' +print data.size +print data[0] + +print '如果是nhwc --------' +# rgb rgb rgb rgb rgb +print data[416 * 3 * 2 + 3 * 2 + 2] +# print data[2] + +print '如果是nchw --------' +# rgb rgb rgb rgb rgb +print data[416 * 416 * 2 + 416 * 2 + 2] +# print data[2] + +# 明明是nchw + +for i in range(0, data.size): + out_array.append(data[i]) + +print len(out_array) + +print out_array[416 * 416 * 2 + 416 * 2 + 2] + +tools.save_to_file('datas/in_put_1_3_416_416_2', out_array) diff --git a/python/tools/mdl2fluid/float2halffloat.py b/python/tools/mdl2fluid/float2halffloat.py new file mode 100644 index 0000000000000000000000000000000000000000..3df8d43f9548429cef5d49f72fb07f3cef264834 --- /dev/null +++ b/python/tools/mdl2fluid/float2halffloat.py @@ -0,0 +1,70 @@ +# encoding:utf-8 +import math +import re + + +def Real2HalfFloat(data): + MINNUM = -65536 + MAXNUM = 65535 + FloatVal = 0 + if data: + if data < MINNUM: + data = MINNUM + if data > MAXNUM: + data = MAXNUM + + sign = 0 + if data < 0: + sign = 1 + data = -data + + exp = math.floor((math.log2(data))) + expout = exp + 16 + + Mantial = round(data / pow(2, exp - 10)) - 1024 + + if expout 
<= 0: + FloatVal = 0 + else: + FloatVal = sign * 32768 + expout * 1024 + Mantial + return FloatVal + + +def ReadCfloatData(sourcefile): + input = [] + with open(sourcfile, 'r') as f: + for line in f.readlines(): + line = line.strip() + line = re.sub('\s+', ' ', line) # 两个数字间多个空格 + input.append(line.split(' ')) + destfile = sourcefile.replace('.dat', '') + destfile = destfile.replace('.txt', '') + destfile += 'Out.dat' + with open(destfile, 'w') as fw: + for i in range(len(input)): + if len(input[i]) == 2: + real = Real2HalfFloat(float(input[i][0])) + imag = Real2HalfFloat(float(input[i][1])) + result = real * 65536 + imag + if imag and not real: + fw.write('0x0000' + "%X" % result + '\n') + elif not imag and not real: + fw.write('0x00000000' + '\n') + else: + fw.write('0x' + "%X" % result + '\n') + elif len(input[i]) == 1: + result = Real2HalfFloat(float(input[i][0])) + if result: + fw.write('0x' + "%X" % result + '\n') + else: + fw.write('0x0000' + '\n') + + +if __name__ == '__main__': + print('Tips: Input number 0 if you want to exit!\n') + while True: + sourcfile = input("input source file:\n") + if sourcfile is '0': + break + ReadCfloatData(sourcfile) + print('Transfer Success!') diff --git a/python/tools/mdl2fluid/framework.proto b/python/tools/mdl2fluid/framework.proto new file mode 100644 index 0000000000000000000000000000000000000000..07bfef1c2a69c236ac86732b2dbc00d8abb6334b --- /dev/null +++ b/python/tools/mdl2fluid/framework.proto @@ -0,0 +1,176 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto2"; +option optimize_for = LITE_RUNTIME; +package paddle_mobile.framework.proto; + +enum AttrType { + INT = 0; + FLOAT = 1; + STRING = 2; + INTS = 3; + FLOATS = 4; + STRINGS = 5; + BOOLEAN = 6; + BOOLEANS = 7; + BLOCK = 8; + LONG = 9; +} + +// OpDesc describes an instance of a C++ framework::OperatorBase +// derived class type. +message OpDesc { + + message Attr { + required string name = 1; + required AttrType type = 2; + optional int32 i = 3; + optional float f = 4; + optional string s = 5; + repeated int32 ints = 6; + repeated float floats = 7; + repeated string strings = 8; + optional bool b = 10; + repeated bool bools = 11; + optional int32 block_idx = 12; + optional int64 l = 13; + }; + + message Var { + required string parameter = 1; + repeated string arguments = 2; + }; + + required string type = 3; + repeated Var inputs = 1; + repeated Var outputs = 2; + repeated Attr attrs = 4; + optional bool is_target = 5 [ default = false ]; +}; + +// OpProto describes a C++ framework::OperatorBase derived class. +message OpProto { + + // VarProto describes the C++ type framework::Variable. + message Var { + required string name = 1; + required string comment = 2; + + optional bool duplicable = 3 [ default = false ]; + optional bool intermediate = 4 [ default = false ]; + optional bool dispensable = 5 [ default = false ]; + } + + // AttrProto describes the C++ type Attribute. 
+ message Attr { + required string name = 1; + required AttrType type = 2; + required string comment = 3; + // If that attribute is generated, it means the Paddle third + // language binding has responsibility to fill that + // attribute. End-User should not set that attribute. + optional bool generated = 4 [ default = false ]; + } + + required string type = 1; + repeated Var inputs = 2; + repeated Var outputs = 3; + repeated Attr attrs = 4; + required string comment = 5; +} + +message VarType { + enum Type { + // Pod Types + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; + + // Other types that may need additional descriptions + LOD_TENSOR = 7; + SELECTED_ROWS = 8; + FEED_MINIBATCH = 9; + FETCH_LIST = 10; + STEP_SCOPES = 11; + LOD_RANK_TABLE = 12; + LOD_TENSOR_ARRAY = 13; + PLACE_LIST = 14; + READER = 15; + CHANNEL = 16; + // Any runtime decided variable type is raw + // raw variables should manage their own allocations + // in operators like nccl_op + RAW = 17; + TUPLE = 18; + } + + required Type type = 1; + + message TensorDesc { + // Should only be PODType. Is enforced in C++ + required Type data_type = 1; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] + } + optional TensorDesc selected_rows = 2; + + message LoDTensorDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2 [ default = 0 ]; + } + optional LoDTensorDesc lod_tensor = 3; + + message LoDTensorArrayDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2 [ default = 0 ]; + } + optional LoDTensorArrayDesc tensor_array = 4; + + message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; } + optional ReaderDesc reader = 5; + + message ChannelDesc { + required Type data_type = 1; + required int64 capacity = 2; + } + optional ChannelDesc channel = 6; + + message Tuple { repeated Type element_type = 1; } + optional Tuple tuple = 7; +} + +message VarDesc { + required string name = 1; + required VarType type = 2; + optional bool persistable = 3 [ default = false ]; +} + +message BlockDesc { + required int32 idx = 1; + required int32 parent_idx = 2; + repeated VarDesc vars = 3; + repeated OpDesc ops = 4; + optional int32 forward_block_idx = 5 [ default = -1 ]; +} + +// Please refer to +// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md +// for more details. +// TODO(panyx0718): A model can have multiple programs. Need a +// way to distinguish them. Maybe ID or name? +message ProgramDesc { repeated BlockDesc blocks = 1; } diff --git a/python/tools/mdl2fluid/framework_pb2.py b/python/tools/mdl2fluid/framework_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..3a43deebc91d42e9eb38cf9940020238041d81da --- /dev/null +++ b/python/tools/mdl2fluid/framework_pb2.py @@ -0,0 +1,1141 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: framework.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf.internal import enum_type_wrapper +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='framework.proto', + package='paddle_mobile.framework.proto', + syntax='proto2', + serialized_pb=_b('\n\x0f\x66ramework.proto\x12\x1dpaddle_mobile.framework.proto\"\xe5\x03\n\x06OpDesc\x12\x0c\n\x04type\x18\x03 \x02(\t\x12\x39\n\x06inputs\x18\x01 \x03(\x0b\x32).paddle_mobile.framework.proto.OpDesc.Var\x12:\n\x07outputs\x18\x02 \x03(\x0b\x32).paddle_mobile.framework.proto.OpDesc.Var\x12\x39\n\x05\x61ttrs\x18\x04 \x03(\x0b\x32*.paddle_mobile.framework.proto.OpDesc.Attr\x12\x18\n\tis_target\x18\x05 \x01(\x08:\x05\x66\x61lse\x1a\xd3\x01\n\x04\x41ttr\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x35\n\x04type\x18\x02 \x02(\x0e\x32\'.paddle_mobile.framework.proto.AttrType\x12\t\n\x01i\x18\x03 \x01(\x05\x12\t\n\x01\x66\x18\x04 \x01(\x02\x12\t\n\x01s\x18\x05 \x01(\t\x12\x0c\n\x04ints\x18\x06 \x03(\x05\x12\x0e\n\x06\x66loats\x18\x07 \x03(\x02\x12\x0f\n\x07strings\x18\x08 \x03(\t\x12\t\n\x01\x62\x18\n \x01(\x08\x12\r\n\x05\x62ools\x18\x0b \x03(\x08\x12\x11\n\tblock_idx\x18\x0c \x01(\x05\x12\t\n\x01l\x18\r \x01(\x03\x1a+\n\x03Var\x12\x11\n\tparameter\x18\x01 \x02(\t\x12\x11\n\targuments\x18\x02 \x03(\t\"\xcf\x03\n\x07OpProto\x12\x0c\n\x04type\x18\x01 \x02(\t\x12:\n\x06inputs\x18\x02 \x03(\x0b\x32*.paddle_mobile.framework.proto.OpProto.Var\x12;\n\x07outputs\x18\x03 \x03(\x0b\x32*.paddle_mobile.framework.proto.OpProto.Var\x12:\n\x05\x61ttrs\x18\x04 \x03(\x0b\x32+.paddle_mobile.framework.proto.OpProto.Attr\x12\x0f\n\x07\x63omment\x18\x05 \x02(\t\x1ax\n\x03Var\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x0f\n\x07\x63omment\x18\x02 \x02(\t\x12\x19\n\nduplicable\x18\x03 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0cintermediate\x18\x04 \x01(\x08:\x05\x66\x61lse\x12\x1a\n\x0b\x64ispensable\x18\x05 \x01(\x08:\x05\x66\x61lse\x1av\n\x04\x41ttr\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x35\n\x04type\x18\x02 \x02(\x0e\x32\'.paddle_mobile.framework.proto.AttrType\x12\x0f\n\x07\x63omment\x18\x03 \x02(\t\x12\x18\n\tgenerated\x18\x04 \x01(\x08:\x05\x66\x61lse\"\xb9\n\n\x07VarType\x12\x39\n\x04type\x18\x01 \x02(\x0e\x32+.paddle_mobile.framework.proto.VarType.Type\x12H\n\rselected_rows\x18\x02 \x01(\x0b\x32\x31.paddle_mobile.framework.proto.VarType.TensorDesc\x12H\n\nlod_tensor\x18\x03 \x01(\x0b\x32\x34.paddle_mobile.framework.proto.VarType.LoDTensorDesc\x12O\n\x0ctensor_array\x18\x04 \x01(\x0b\x32\x39.paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc\x12\x41\n\x06reader\x18\x05 \x01(\x0b\x32\x31.paddle_mobile.framework.proto.VarType.ReaderDesc\x12\x43\n\x07\x63hannel\x18\x06 \x01(\x0b\x32\x32.paddle_mobile.framework.proto.VarType.ChannelDesc\x12;\n\x05tuple\x18\x07 \x01(\x0b\x32,.paddle_mobile.framework.proto.VarType.Tuple\x1aZ\n\nTensorDesc\x12>\n\tdata_type\x18\x01 \x02(\x0e\x32+.paddle_mobile.framework.proto.VarType.Type\x12\x0c\n\x04\x64ims\x18\x02 \x03(\x03\x1ah\n\rLoDTensorDesc\x12\x41\n\x06tensor\x18\x01 \x02(\x0b\x32\x31.paddle_mobile.framework.proto.VarType.TensorDesc\x12\x14\n\tlod_level\x18\x02 
\x01(\x05:\x01\x30\x1am\n\x12LoDTensorArrayDesc\x12\x41\n\x06tensor\x18\x01 \x02(\x0b\x32\x31.paddle_mobile.framework.proto.VarType.TensorDesc\x12\x14\n\tlod_level\x18\x02 \x01(\x05:\x01\x30\x1aV\n\nReaderDesc\x12H\n\nlod_tensor\x18\x01 \x03(\x0b\x32\x34.paddle_mobile.framework.proto.VarType.LoDTensorDesc\x1a_\n\x0b\x43hannelDesc\x12>\n\tdata_type\x18\x01 \x02(\x0e\x32+.paddle_mobile.framework.proto.VarType.Type\x12\x10\n\x08\x63\x61pacity\x18\x02 \x02(\x03\x1aJ\n\x05Tuple\x12\x41\n\x0c\x65lement_type\x18\x01 \x03(\x0e\x32+.paddle_mobile.framework.proto.VarType.Type\"\x8e\x02\n\x04Type\x12\x08\n\x04\x42OOL\x10\x00\x12\t\n\x05INT16\x10\x01\x12\t\n\x05INT32\x10\x02\x12\t\n\x05INT64\x10\x03\x12\x08\n\x04\x46P16\x10\x04\x12\x08\n\x04\x46P32\x10\x05\x12\x08\n\x04\x46P64\x10\x06\x12\x0e\n\nLOD_TENSOR\x10\x07\x12\x11\n\rSELECTED_ROWS\x10\x08\x12\x12\n\x0e\x46\x45\x45\x44_MINIBATCH\x10\t\x12\x0e\n\nFETCH_LIST\x10\n\x12\x0f\n\x0bSTEP_SCOPES\x10\x0b\x12\x12\n\x0eLOD_RANK_TABLE\x10\x0c\x12\x14\n\x10LOD_TENSOR_ARRAY\x10\r\x12\x0e\n\nPLACE_LIST\x10\x0e\x12\n\n\x06READER\x10\x0f\x12\x0b\n\x07\x43HANNEL\x10\x10\x12\x07\n\x03RAW\x10\x11\x12\t\n\x05TUPLE\x10\x12\"i\n\x07VarDesc\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x34\n\x04type\x18\x02 \x02(\x0b\x32&.paddle_mobile.framework.proto.VarType\x12\x1a\n\x0bpersistable\x18\x03 \x01(\x08:\x05\x66\x61lse\"\xb5\x01\n\tBlockDesc\x12\x0b\n\x03idx\x18\x01 \x02(\x05\x12\x12\n\nparent_idx\x18\x02 \x02(\x05\x12\x34\n\x04vars\x18\x03 \x03(\x0b\x32&.paddle_mobile.framework.proto.VarDesc\x12\x32\n\x03ops\x18\x04 \x03(\x0b\x32%.paddle_mobile.framework.proto.OpDesc\x12\x1d\n\x11\x66orward_block_idx\x18\x05 \x01(\x05:\x02-1\"G\n\x0bProgramDesc\x12\x38\n\x06\x62locks\x18\x01 \x03(\x0b\x32(.paddle_mobile.framework.proto.BlockDesc*}\n\x08\x41ttrType\x12\x07\n\x03INT\x10\x00\x12\t\n\x05\x46LOAT\x10\x01\x12\n\n\x06STRING\x10\x02\x12\x08\n\x04INTS\x10\x03\x12\n\n\x06\x46LOATS\x10\x04\x12\x0b\n\x07STRINGS\x10\x05\x12\x0b\n\x07\x42OOLEAN\x10\x06\x12\x0c\n\x08\x42OOLEANS\x10\x07\x12\t\n\x05\x42LOCK\x10\x08\x12\x08\n\x04LONG\x10\tB\x02H\x03') +) +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +_ATTRTYPE = _descriptor.EnumDescriptor( + name='AttrType', + full_name='paddle_mobile.framework.proto.AttrType', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='INT', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='FLOAT', index=1, number=1, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='STRING', index=2, number=2, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='INTS', index=3, number=3, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='FLOATS', index=4, number=4, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='STRINGS', index=5, number=5, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='BOOLEAN', index=6, number=6, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='BOOLEANS', index=7, number=7, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='BLOCK', index=8, number=8, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='LONG', index=9, number=9, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=2708, + serialized_end=2833, +) +_sym_db.RegisterEnumDescriptor(_ATTRTYPE) + +AttrType = enum_type_wrapper.EnumTypeWrapper(_ATTRTYPE) +INT = 0 +FLOAT = 1 +STRING = 2 +INTS = 3 +FLOATS = 4 
+STRINGS = 5 +BOOLEAN = 6 +BOOLEANS = 7 +BLOCK = 8 +LONG = 9 + + +_VARTYPE_TYPE = _descriptor.EnumDescriptor( + name='Type', + full_name='paddle_mobile.framework.proto.VarType.Type', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='BOOL', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='INT16', index=1, number=1, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='INT32', index=2, number=2, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='INT64', index=3, number=3, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='FP16', index=4, number=4, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='FP32', index=5, number=5, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='FP64', index=6, number=6, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='LOD_TENSOR', index=7, number=7, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='SELECTED_ROWS', index=8, number=8, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='FEED_MINIBATCH', index=9, number=9, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='FETCH_LIST', index=10, number=10, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='STEP_SCOPES', index=11, number=11, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='LOD_RANK_TABLE', index=12, number=12, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='LOD_TENSOR_ARRAY', index=13, number=13, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PLACE_LIST', index=14, number=14, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='READER', index=15, number=15, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='CHANNEL', index=16, number=16, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='RAW', index=17, number=17, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='TUPLE', index=18, number=18, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=2072, + serialized_end=2342, +) +_sym_db.RegisterEnumDescriptor(_VARTYPE_TYPE) + + +_OPDESC_ATTR = _descriptor.Descriptor( + name='Attr', + full_name='paddle_mobile.framework.proto.OpDesc.Attr', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', full_name='paddle_mobile.framework.proto.OpDesc.Attr.name', index=0, + number=1, type=9, cpp_type=9, label=2, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='type', full_name='paddle_mobile.framework.proto.OpDesc.Attr.type', index=1, + number=2, type=14, cpp_type=8, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='i', full_name='paddle_mobile.framework.proto.OpDesc.Attr.i', index=2, + number=3, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='f', 
full_name='paddle_mobile.framework.proto.OpDesc.Attr.f', index=3, + number=4, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='s', full_name='paddle_mobile.framework.proto.OpDesc.Attr.s', index=4, + number=5, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='ints', full_name='paddle_mobile.framework.proto.OpDesc.Attr.ints', index=5, + number=6, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='floats', full_name='paddle_mobile.framework.proto.OpDesc.Attr.floats', index=6, + number=7, type=2, cpp_type=6, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='strings', full_name='paddle_mobile.framework.proto.OpDesc.Attr.strings', index=7, + number=8, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='b', full_name='paddle_mobile.framework.proto.OpDesc.Attr.b', index=8, + number=10, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='bools', full_name='paddle_mobile.framework.proto.OpDesc.Attr.bools', index=9, + number=11, type=8, cpp_type=7, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='block_idx', full_name='paddle_mobile.framework.proto.OpDesc.Attr.block_idx', index=10, + number=12, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='l', full_name='paddle_mobile.framework.proto.OpDesc.Attr.l', index=11, + number=13, type=3, cpp_type=2, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=280, + serialized_end=491, +) + +_OPDESC_VAR = _descriptor.Descriptor( + name='Var', + full_name='paddle_mobile.framework.proto.OpDesc.Var', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='parameter', full_name='paddle_mobile.framework.proto.OpDesc.Var.parameter', index=0, + number=1, type=9, cpp_type=9, label=2, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, 
containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='arguments', full_name='paddle_mobile.framework.proto.OpDesc.Var.arguments', index=1, + number=2, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=493, + serialized_end=536, +) + +_OPDESC = _descriptor.Descriptor( + name='OpDesc', + full_name='paddle_mobile.framework.proto.OpDesc', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='type', full_name='paddle_mobile.framework.proto.OpDesc.type', index=0, + number=3, type=9, cpp_type=9, label=2, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='inputs', full_name='paddle_mobile.framework.proto.OpDesc.inputs', index=1, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='outputs', full_name='paddle_mobile.framework.proto.OpDesc.outputs', index=2, + number=2, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='attrs', full_name='paddle_mobile.framework.proto.OpDesc.attrs', index=3, + number=4, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='is_target', full_name='paddle_mobile.framework.proto.OpDesc.is_target', index=4, + number=5, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[_OPDESC_ATTR, _OPDESC_VAR, ], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=51, + serialized_end=536, +) + + +_OPPROTO_VAR = _descriptor.Descriptor( + name='Var', + full_name='paddle_mobile.framework.proto.OpProto.Var', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', full_name='paddle_mobile.framework.proto.OpProto.Var.name', index=0, + number=1, type=9, cpp_type=9, label=2, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='comment', full_name='paddle_mobile.framework.proto.OpProto.Var.comment', index=1, + number=2, type=9, cpp_type=9, label=2, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + 
_descriptor.FieldDescriptor( + name='duplicable', full_name='paddle_mobile.framework.proto.OpProto.Var.duplicable', index=2, + number=3, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='intermediate', full_name='paddle_mobile.framework.proto.OpProto.Var.intermediate', index=3, + number=4, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dispensable', full_name='paddle_mobile.framework.proto.OpProto.Var.dispensable', index=4, + number=5, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=762, + serialized_end=882, +) + +_OPPROTO_ATTR = _descriptor.Descriptor( + name='Attr', + full_name='paddle_mobile.framework.proto.OpProto.Attr', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', full_name='paddle_mobile.framework.proto.OpProto.Attr.name', index=0, + number=1, type=9, cpp_type=9, label=2, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='type', full_name='paddle_mobile.framework.proto.OpProto.Attr.type', index=1, + number=2, type=14, cpp_type=8, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='comment', full_name='paddle_mobile.framework.proto.OpProto.Attr.comment', index=2, + number=3, type=9, cpp_type=9, label=2, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='generated', full_name='paddle_mobile.framework.proto.OpProto.Attr.generated', index=3, + number=4, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=884, + serialized_end=1002, +) + +_OPPROTO = _descriptor.Descriptor( + name='OpProto', + full_name='paddle_mobile.framework.proto.OpProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='type', full_name='paddle_mobile.framework.proto.OpProto.type', index=0, + number=1, type=9, cpp_type=9, label=2, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='inputs', 
full_name='paddle_mobile.framework.proto.OpProto.inputs', index=1, + number=2, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='outputs', full_name='paddle_mobile.framework.proto.OpProto.outputs', index=2, + number=3, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='attrs', full_name='paddle_mobile.framework.proto.OpProto.attrs', index=3, + number=4, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='comment', full_name='paddle_mobile.framework.proto.OpProto.comment', index=4, + number=5, type=9, cpp_type=9, label=2, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[_OPPROTO_VAR, _OPPROTO_ATTR, ], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=539, + serialized_end=1002, +) + + +_VARTYPE_TENSORDESC = _descriptor.Descriptor( + name='TensorDesc', + full_name='paddle_mobile.framework.proto.VarType.TensorDesc', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='data_type', full_name='paddle_mobile.framework.proto.VarType.TensorDesc.data_type', index=0, + number=1, type=14, cpp_type=8, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dims', full_name='paddle_mobile.framework.proto.VarType.TensorDesc.dims', index=1, + number=2, type=3, cpp_type=2, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1501, + serialized_end=1591, +) + +_VARTYPE_LODTENSORDESC = _descriptor.Descriptor( + name='LoDTensorDesc', + full_name='paddle_mobile.framework.proto.VarType.LoDTensorDesc', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='tensor', full_name='paddle_mobile.framework.proto.VarType.LoDTensorDesc.tensor', index=0, + number=1, type=11, cpp_type=10, label=2, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='lod_level', full_name='paddle_mobile.framework.proto.VarType.LoDTensorDesc.lod_level', index=1, + number=2, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + 
options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1593, + serialized_end=1697, +) + +_VARTYPE_LODTENSORARRAYDESC = _descriptor.Descriptor( + name='LoDTensorArrayDesc', + full_name='paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='tensor', full_name='paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc.tensor', index=0, + number=1, type=11, cpp_type=10, label=2, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='lod_level', full_name='paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc.lod_level', index=1, + number=2, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1699, + serialized_end=1808, +) + +_VARTYPE_READERDESC = _descriptor.Descriptor( + name='ReaderDesc', + full_name='paddle_mobile.framework.proto.VarType.ReaderDesc', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='lod_tensor', full_name='paddle_mobile.framework.proto.VarType.ReaderDesc.lod_tensor', index=0, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1810, + serialized_end=1896, +) + +_VARTYPE_CHANNELDESC = _descriptor.Descriptor( + name='ChannelDesc', + full_name='paddle_mobile.framework.proto.VarType.ChannelDesc', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='data_type', full_name='paddle_mobile.framework.proto.VarType.ChannelDesc.data_type', index=0, + number=1, type=14, cpp_type=8, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='capacity', full_name='paddle_mobile.framework.proto.VarType.ChannelDesc.capacity', index=1, + number=2, type=3, cpp_type=2, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1898, + serialized_end=1993, +) + +_VARTYPE_TUPLE = _descriptor.Descriptor( + name='Tuple', + full_name='paddle_mobile.framework.proto.VarType.Tuple', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='element_type', full_name='paddle_mobile.framework.proto.VarType.Tuple.element_type', index=0, + number=1, type=14, cpp_type=8, label=3, + has_default_value=False, default_value=[], + 
message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1995, + serialized_end=2069, +) + +_VARTYPE = _descriptor.Descriptor( + name='VarType', + full_name='paddle_mobile.framework.proto.VarType', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='type', full_name='paddle_mobile.framework.proto.VarType.type', index=0, + number=1, type=14, cpp_type=8, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='selected_rows', full_name='paddle_mobile.framework.proto.VarType.selected_rows', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='lod_tensor', full_name='paddle_mobile.framework.proto.VarType.lod_tensor', index=2, + number=3, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='tensor_array', full_name='paddle_mobile.framework.proto.VarType.tensor_array', index=3, + number=4, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='reader', full_name='paddle_mobile.framework.proto.VarType.reader', index=4, + number=5, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='channel', full_name='paddle_mobile.framework.proto.VarType.channel', index=5, + number=6, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='tuple', full_name='paddle_mobile.framework.proto.VarType.tuple', index=6, + number=7, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[_VARTYPE_TENSORDESC, _VARTYPE_LODTENSORDESC, _VARTYPE_LODTENSORARRAYDESC, _VARTYPE_READERDESC, _VARTYPE_CHANNELDESC, _VARTYPE_TUPLE, ], + enum_types=[ + _VARTYPE_TYPE, + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1005, + serialized_end=2342, +) + + +_VARDESC = _descriptor.Descriptor( + name='VarDesc', + full_name='paddle_mobile.framework.proto.VarDesc', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', full_name='paddle_mobile.framework.proto.VarDesc.name', index=0, + number=1, type=9, cpp_type=9, label=2, + has_default_value=False, 
default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='type', full_name='paddle_mobile.framework.proto.VarDesc.type', index=1, + number=2, type=11, cpp_type=10, label=2, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='persistable', full_name='paddle_mobile.framework.proto.VarDesc.persistable', index=2, + number=3, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2344, + serialized_end=2449, +) + + +_BLOCKDESC = _descriptor.Descriptor( + name='BlockDesc', + full_name='paddle_mobile.framework.proto.BlockDesc', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='idx', full_name='paddle_mobile.framework.proto.BlockDesc.idx', index=0, + number=1, type=5, cpp_type=1, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='parent_idx', full_name='paddle_mobile.framework.proto.BlockDesc.parent_idx', index=1, + number=2, type=5, cpp_type=1, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='vars', full_name='paddle_mobile.framework.proto.BlockDesc.vars', index=2, + number=3, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='ops', full_name='paddle_mobile.framework.proto.BlockDesc.ops', index=3, + number=4, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='forward_block_idx', full_name='paddle_mobile.framework.proto.BlockDesc.forward_block_idx', index=4, + number=5, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=-1, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2452, + serialized_end=2633, +) + + +_PROGRAMDESC = _descriptor.Descriptor( + name='ProgramDesc', + full_name='paddle_mobile.framework.proto.ProgramDesc', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='blocks', full_name='paddle_mobile.framework.proto.ProgramDesc.blocks', index=0, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, 
extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2635, + serialized_end=2706, +) + +_OPDESC_ATTR.fields_by_name['type'].enum_type = _ATTRTYPE +_OPDESC_ATTR.containing_type = _OPDESC +_OPDESC_VAR.containing_type = _OPDESC +_OPDESC.fields_by_name['inputs'].message_type = _OPDESC_VAR +_OPDESC.fields_by_name['outputs'].message_type = _OPDESC_VAR +_OPDESC.fields_by_name['attrs'].message_type = _OPDESC_ATTR +_OPPROTO_VAR.containing_type = _OPPROTO +_OPPROTO_ATTR.fields_by_name['type'].enum_type = _ATTRTYPE +_OPPROTO_ATTR.containing_type = _OPPROTO +_OPPROTO.fields_by_name['inputs'].message_type = _OPPROTO_VAR +_OPPROTO.fields_by_name['outputs'].message_type = _OPPROTO_VAR +_OPPROTO.fields_by_name['attrs'].message_type = _OPPROTO_ATTR +_VARTYPE_TENSORDESC.fields_by_name['data_type'].enum_type = _VARTYPE_TYPE +_VARTYPE_TENSORDESC.containing_type = _VARTYPE +_VARTYPE_LODTENSORDESC.fields_by_name['tensor'].message_type = _VARTYPE_TENSORDESC +_VARTYPE_LODTENSORDESC.containing_type = _VARTYPE +_VARTYPE_LODTENSORARRAYDESC.fields_by_name['tensor'].message_type = _VARTYPE_TENSORDESC +_VARTYPE_LODTENSORARRAYDESC.containing_type = _VARTYPE +_VARTYPE_READERDESC.fields_by_name['lod_tensor'].message_type = _VARTYPE_LODTENSORDESC +_VARTYPE_READERDESC.containing_type = _VARTYPE +_VARTYPE_CHANNELDESC.fields_by_name['data_type'].enum_type = _VARTYPE_TYPE +_VARTYPE_CHANNELDESC.containing_type = _VARTYPE +_VARTYPE_TUPLE.fields_by_name['element_type'].enum_type = _VARTYPE_TYPE +_VARTYPE_TUPLE.containing_type = _VARTYPE +_VARTYPE.fields_by_name['type'].enum_type = _VARTYPE_TYPE +_VARTYPE.fields_by_name['selected_rows'].message_type = _VARTYPE_TENSORDESC +_VARTYPE.fields_by_name['lod_tensor'].message_type = _VARTYPE_LODTENSORDESC +_VARTYPE.fields_by_name['tensor_array'].message_type = _VARTYPE_LODTENSORARRAYDESC +_VARTYPE.fields_by_name['reader'].message_type = _VARTYPE_READERDESC +_VARTYPE.fields_by_name['channel'].message_type = _VARTYPE_CHANNELDESC +_VARTYPE.fields_by_name['tuple'].message_type = _VARTYPE_TUPLE +_VARTYPE_TYPE.containing_type = _VARTYPE +_VARDESC.fields_by_name['type'].message_type = _VARTYPE +_BLOCKDESC.fields_by_name['vars'].message_type = _VARDESC +_BLOCKDESC.fields_by_name['ops'].message_type = _OPDESC +_PROGRAMDESC.fields_by_name['blocks'].message_type = _BLOCKDESC +DESCRIPTOR.message_types_by_name['OpDesc'] = _OPDESC +DESCRIPTOR.message_types_by_name['OpProto'] = _OPPROTO +DESCRIPTOR.message_types_by_name['VarType'] = _VARTYPE +DESCRIPTOR.message_types_by_name['VarDesc'] = _VARDESC +DESCRIPTOR.message_types_by_name['BlockDesc'] = _BLOCKDESC +DESCRIPTOR.message_types_by_name['ProgramDesc'] = _PROGRAMDESC +DESCRIPTOR.enum_types_by_name['AttrType'] = _ATTRTYPE + +OpDesc = _reflection.GeneratedProtocolMessageType('OpDesc', (_message.Message,), dict( + + Attr = _reflection.GeneratedProtocolMessageType('Attr', (_message.Message,), dict( + DESCRIPTOR = _OPDESC_ATTR, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.OpDesc.Attr) + )) + , + + Var = _reflection.GeneratedProtocolMessageType('Var', (_message.Message,), dict( + DESCRIPTOR = _OPDESC_VAR, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.OpDesc.Var) + )) + , + DESCRIPTOR = _OPDESC, + __module__ = 'framework_pb2' + # 
@@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.OpDesc) + )) +_sym_db.RegisterMessage(OpDesc) +_sym_db.RegisterMessage(OpDesc.Attr) +_sym_db.RegisterMessage(OpDesc.Var) + +OpProto = _reflection.GeneratedProtocolMessageType('OpProto', (_message.Message,), dict( + + Var = _reflection.GeneratedProtocolMessageType('Var', (_message.Message,), dict( + DESCRIPTOR = _OPPROTO_VAR, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.OpProto.Var) + )) + , + + Attr = _reflection.GeneratedProtocolMessageType('Attr', (_message.Message,), dict( + DESCRIPTOR = _OPPROTO_ATTR, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.OpProto.Attr) + )) + , + DESCRIPTOR = _OPPROTO, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.OpProto) + )) +_sym_db.RegisterMessage(OpProto) +_sym_db.RegisterMessage(OpProto.Var) +_sym_db.RegisterMessage(OpProto.Attr) + +VarType = _reflection.GeneratedProtocolMessageType('VarType', (_message.Message,), dict( + + TensorDesc = _reflection.GeneratedProtocolMessageType('TensorDesc', (_message.Message,), dict( + DESCRIPTOR = _VARTYPE_TENSORDESC, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType.TensorDesc) + )) + , + + LoDTensorDesc = _reflection.GeneratedProtocolMessageType('LoDTensorDesc', (_message.Message,), dict( + DESCRIPTOR = _VARTYPE_LODTENSORDESC, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType.LoDTensorDesc) + )) + , + + LoDTensorArrayDesc = _reflection.GeneratedProtocolMessageType('LoDTensorArrayDesc', (_message.Message,), dict( + DESCRIPTOR = _VARTYPE_LODTENSORARRAYDESC, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc) + )) + , + + ReaderDesc = _reflection.GeneratedProtocolMessageType('ReaderDesc', (_message.Message,), dict( + DESCRIPTOR = _VARTYPE_READERDESC, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType.ReaderDesc) + )) + , + + ChannelDesc = _reflection.GeneratedProtocolMessageType('ChannelDesc', (_message.Message,), dict( + DESCRIPTOR = _VARTYPE_CHANNELDESC, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType.ChannelDesc) + )) + , + + Tuple = _reflection.GeneratedProtocolMessageType('Tuple', (_message.Message,), dict( + DESCRIPTOR = _VARTYPE_TUPLE, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType.Tuple) + )) + , + DESCRIPTOR = _VARTYPE, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType) + )) +_sym_db.RegisterMessage(VarType) +_sym_db.RegisterMessage(VarType.TensorDesc) +_sym_db.RegisterMessage(VarType.LoDTensorDesc) +_sym_db.RegisterMessage(VarType.LoDTensorArrayDesc) +_sym_db.RegisterMessage(VarType.ReaderDesc) +_sym_db.RegisterMessage(VarType.ChannelDesc) +_sym_db.RegisterMessage(VarType.Tuple) + +VarDesc = _reflection.GeneratedProtocolMessageType('VarDesc', (_message.Message,), dict( + DESCRIPTOR = _VARDESC, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarDesc) + )) +_sym_db.RegisterMessage(VarDesc) + +BlockDesc = _reflection.GeneratedProtocolMessageType('BlockDesc', (_message.Message,), dict( + 
DESCRIPTOR = _BLOCKDESC, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.BlockDesc) + )) +_sym_db.RegisterMessage(BlockDesc) + +ProgramDesc = _reflection.GeneratedProtocolMessageType('ProgramDesc', (_message.Message,), dict( + DESCRIPTOR = _PROGRAMDESC, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.ProgramDesc) + )) +_sym_db.RegisterMessage(ProgramDesc) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('H\003')) +# @@protoc_insertion_point(module_scope) diff --git a/python/tools/mdl2fluid/loader.py b/python/tools/mdl2fluid/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..ef2258e365a84003b7b90ac480abbd9798f48f59 --- /dev/null +++ b/python/tools/mdl2fluid/loader.py @@ -0,0 +1,18 @@ +import datetime +import json +import os + +import google.protobuf as pbg +import framework_pb2 as framework_pb2 + + +def loadmdl(json_path): + print('mdl json path : ' + json_path) + with open(json_path, 'r') as f: + json_dick = json.load(f) + # print(json_dick) + layers = (json_dick['layer']) + for layer in layers: + print(layer) + + diff --git a/python/tools/mdl2fluid/mdl2fluid.py b/python/tools/mdl2fluid/mdl2fluid.py new file mode 100644 index 0000000000000000000000000000000000000000..a57a01d09eaf236fd9f890dcb9e8eead19aa7868 --- /dev/null +++ b/python/tools/mdl2fluid/mdl2fluid.py @@ -0,0 +1,335 @@ +import json +import os + +import framework_pb2 as framework_pb2 +import op_types as types +from swicher import Swichter +import shutil + + +def load_mdl(mdl_json_path): + # print('mdl json path : ' + mdl_json_path) + with open(mdl_json_path, 'r') as f: + return json.load(f) + + +class Converter: + 'convert mdlmodel to fluidmodel' + + def __init__(self, mdl_json_path): + self.mdl_json_path = mdl_json_path + print mdl_json_path + self.mdl_json = load_mdl(self.mdl_json_path) + self.program_desc = framework_pb2.ProgramDesc() + self.weight_list_ = [] + self.deepwise_weight_list_ = [] + # print(json_dick) + # layers = (json_dick['layer']) + # for layer in layers: + # print(layer) + + def convert(self): + print 'convert begin.....' + # add block_desc + block_desc = self.program_desc.blocks.add() + block_desc.idx = 0 + block_desc.parent_idx = -1 + self.package_ops(block_desc) + self.package_vars(block_desc) + print 'blocks: ' + print self.program_desc.blocks + print 'convert end.....' 
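+        # Write out the result: serialize the assembled ProgramDesc, rebuild the
+        # newyolo/ output directory from the NCHW weight files prepared by
+        # package_vars(), then save the descriptor as the fluid __model__ file.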
+ desc_serialize_to_string = self.program_desc.SerializeToString() + shutil.rmtree('newyolo/') + shutil.copytree('multiobjects/float32s_nchw_with_head', 'newyolo/') + + f = open("newyolo/__model__", "wb") + f.write(desc_serialize_to_string) + f.close() + + def package_ops(self, block_desc): + + self.add_op_feed(block_desc) + + # add ops with layer + if 'layer' in self.mdl_json: + + layers_ = self.mdl_json['layer'] + for layer in layers_: + desc_ops_add = block_desc.ops.add() + + # print layer + # for i in layer: + # print i + if 'name' in layer: + l_name = layer['name'] + if 'type' in layer: + self.package_ops_type(desc_ops_add, layer) + + if 'weight' in layer: + self.package_ops_weight2inputs(desc_ops_add, layer) + + if 'output' in layer: + self.package_ops_outputs(desc_ops_add, layer) + + if 'input' in layer: + self.package_ops_inputs(desc_ops_add, layer) + + self.package_ops_attrs(desc_ops_add, layer) + + self.add_op_fetch(block_desc) + + def add_op_feed(self, block_desc): + desc_ops_add = block_desc.ops.add() + inputs_add = desc_ops_add.inputs.add() + inputs_add.parameter = 'X' + inputs_add.arguments.append('feed') + desc_ops_add.type = 'feed' + outputs_add = desc_ops_add.outputs.add() + outputs_add.parameter = 'Out' + outputs_add.arguments.append('data') + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'col' + # boolean + attrs_add.type = 0 + attrs_add.i = 0 + + def add_op_fetch(self, block_desc): + desc_ops_add = block_desc.ops.add() + inputs_add = desc_ops_add.inputs.add() + inputs_add.parameter = 'X' + inputs_add.arguments.append('conv_pred_87') + desc_ops_add.type = 'fetch' + outputs_add = desc_ops_add.outputs.add() + outputs_add.parameter = 'Out' + outputs_add.arguments.append('fetch') + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'col' + # boolean + attrs_add.type = 0 + attrs_add.i = 0 + + @staticmethod + def package_ops_attrs(desc_ops_add, layer): + # print l_params + # print desc_ops_add.type + if desc_ops_add.type == types.op_fluid_fusion_conv_add: + Converter.pack_fusion_conv_add_attr(desc_ops_add, layer) + elif desc_ops_add.type == types.op_fluid_relu: + # fusion_conv_add : attrs + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'use_mkldnn' + # boolean + attrs_add.type = 6 + attrs_add.b = 0 + + @staticmethod + def pack_fusion_conv_add_attr(desc_ops_add, layer): + + # fusion_conv_add : attrs + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'workspace_size_MB' + # 0-->INT + attrs_add.type = 0 + attrs_add.i = 4096 + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'data_format' + # 2-->STRING + attrs_add.type = 2 + attrs_add.s = 'AnyLayout' + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'use_mkldnn' + # boolean + attrs_add.type = 6 + attrs_add.b = 0 + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'use_cudnn' + # boolean + attrs_add.type = 6 + attrs_add.b = 1 + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'dilations' + # ints + attrs_add.type = 3 + attrs_add.ints.append(1) + attrs_add.ints.append(1) + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'axis' + # int + attrs_add.type = 0 + attrs_add.i = 1 + + if 'param' in layer: + l_params = layer['param'] + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'paddings' + # ints + attrs_add.type = 3 + attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('paddings')]) + attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('paddings')]) + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 
'strides' + # ints + attrs_add.type = 3 + attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('strides')]) + attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('strides')]) + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'groups' + # int + attrs_add.type = 0 + attrs_add.i = l_params[types.fusion_conv_add_attrs_dict.get('groups')] + # attrs_add.i = 1 + + # + # op_attrs_tupl = types.op_io_dict.get(desc_ops_add.type) \ + # .get(types.mdl_attrs_key) + # + # + # + # + # # group stride padding + # print '----------------------' + # for i, val in enumerate(op_attrs_tupl): + # attrs_add = desc_ops_add.attrs.add() + # attr_name = op_attrs_tupl[i] + # print attr_name + # attrs_add.name = attr_name + # attrs_add.type = types.fluid_attrs_type_dict.get(attr_name) + # attrs_add. + # print l_params[types.fusion_conv_add_attrs_dict.get(attr_name)] + + # for p in l_params: + # attrs_add = desc_ops_add.attrs.add() + + @staticmethod + def package_ops_inputs(desc_ops_add, layer): + l_inputs = layer['input'] + for i in l_inputs: + inputs_add = desc_ops_add.inputs.add() + # print i + inputs_add.parameter = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_inputs_key) + inputs_add.arguments.append(i) + + @staticmethod + def package_ops_outputs(desc_ops_add, layer): + l_outputs = layer['output'] + for o in l_outputs: + # print o + outputs_add = desc_ops_add.outputs.add() + outputs_add.parameter = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_outputs_key) + outputs_add.arguments.append(o) + + def package_ops_weight2inputs(self, desc_ops_add, layer): + l_weights = layer['weight'] + for w in l_weights: + self.weight_list_.append(w) + + if layer['type'] == 'DepthwiseConvolutionLayer': + # print l_weights[0] + self.deepwise_weight_list_.append(l_weights[0]) + + op_weight_tup = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_weight_key) + # print len(op_weight_tup) + for i, val in enumerate(op_weight_tup): + # print i + # print val + inputs_add = desc_ops_add.inputs.add() + inputs_add.parameter = op_weight_tup[i] + inputs_add.arguments.append(l_weights[i]) + + # for w in l_weights: + # inputs_add = desc_ops_add.inputs.add() + # # print w + # inputs_add.parameter = op_weight_tup[0] + # inputs_add.arguments.append(w) + + @staticmethod + def package_ops_type(desc_ops_add, layer): + l_type = layer['type'] + # print l_type + # print mdl2fluid_op_layer_dict.get(l_type) + desc_ops_add.type = types.mdl2fluid_op_layer_dict.get(l_type) + + def package_vars(self, block_desc): + vars_add = block_desc.vars.add() + vars_add.name = 'feed' + vars_add.type.type = 9 # 9 is FEED_MINIBATCH + vars_add.persistable = 1 + # fetch + vars_add = block_desc.vars.add() + vars_add.name = 'fetch' + vars_add.type.type = 10 # 10 is fetch list + vars_add.persistable = 1 + + json_matrix_ = self.mdl_json['matrix'] + # print json_matrix_ + for j in json_matrix_: + vars_add = block_desc.vars.add() + vars_add.name = j + vars_add.type.type = 7 # 7 is lodtensor + # print j + tensor = vars_add.type.lod_tensor.tensor + tensor.data_type = 5 # 5 is FP32 + + # print json_matrix_ + + dims_of_matrix = json_matrix_.get(j) + # dims_size = len(dims_of_matrix) + # print dims_size + + # if dims_size == 4: + # tensor.dims.append(dims_of_matrix[0]) # N + # tensor.dims.append(dims_of_matrix[3]) # C + # tensor.dims.append(dims_of_matrix[1]) # H + # tensor.dims.append(dims_of_matrix[2]) # W + # else: + + # issues in mdl model filter swich n and c + if j in self.deepwise_weight_list_ and len(dims_of_matrix) == 4: + 
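+                # Per the note above, the mdl filter stores these dims with n and c
+                # swapped, so the first two entries are exchanged to restore NCHW.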
print j + tensor.dims.append(dims_of_matrix[1]) + tensor.dims.append(dims_of_matrix[0]) + tensor.dims.append(dims_of_matrix[2]) + tensor.dims.append(dims_of_matrix[3]) + print tensor.dims + else: + for dims in dims_of_matrix: + # print dims + tensor.dims.append(dims) + + if j in self.weight_list_: + vars_add.persistable = 1 + dims_size = len(dims_of_matrix) + # print dims_size + if dims_size == 4: + # convert weight from nhwc to nchw + Swichter().nhwc2nchw_one_slice_add_head( + '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/' + j + '.bin', + '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw_with_head/' + j, + '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/' + j + '.tmp', + dims_of_matrix[0], + dims_of_matrix[1], + dims_of_matrix[2], + dims_of_matrix[3] + ) + else: + Swichter().copy_add_head( + '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/' + j + '.bin', + '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw_with_head/' + j, + '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/' + j + '.tmp' + ) + else: + vars_add.persistable = 0 + + +mdl_path = "/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/YOLO_Universal.json" +converter = Converter(mdl_path) +converter.convert() diff --git a/python/tools/mdl2fluid/model_combine.py b/python/tools/mdl2fluid/model_combine.py new file mode 100644 index 0000000000000000000000000000000000000000..ae3ca8a786dc0d4032deda35c33f44d3d96e983d --- /dev/null +++ b/python/tools/mdl2fluid/model_combine.py @@ -0,0 +1,19 @@ +# coding=utf-8 +import os + +path = "yolo_v2_tofile_source/" # 文件夹目录 +to_file_path = "yolo_v2_tofile_combined/params" +files = os.listdir(path) # 得到文件夹下的所有文件名称 +files.sort(cmp=None, key=str.lower) +to_file = open(to_file_path, "wb") + +for file in files: # 遍历文件夹 + if not os.path.isdir(file): # 判断是否是文件夹,不是文件夹才打开 + f = open(path + "/" + file) # 打开文件 + name = f.name + print 'name: ' + name + from_file = open(name, "rb") + to_file.write(from_file.read()) + from_file.close() + +to_file.close() diff --git a/python/tools/mdl2fluid/model_reader.py b/python/tools/mdl2fluid/model_reader.py new file mode 100644 index 0000000000000000000000000000000000000000..8d53350db20739526b77663f791942299d4bc149 --- /dev/null +++ b/python/tools/mdl2fluid/model_reader.py @@ -0,0 +1,30 @@ +import os + +import framework_pb2 as framework_pb2 + + +def read_model(model_path): + print('read_model.') + path_8 = unicode(model_path, 'utf8') + + try: + with open(path_8, "rb") as f_model: + print get_file_size(model_path) + desc = framework_pb2.ProgramDesc() + desc.ParseFromString(f_model.read()) + print desc + # print desc.blocks + + except IOError: + print ": File not found. Creating a new file." 
+ + +def get_file_size(file_path): + file_path = unicode(file_path, 'utf8') + fsize = os.path.getsize(file_path) + fsize = fsize / float(1024 * 1024) + return round(fsize, 2) + + +path = "newyolo/__model__" +read_model(path) diff --git a/python/tools/mdl2fluid/op_types.py b/python/tools/mdl2fluid/op_types.py new file mode 100644 index 0000000000000000000000000000000000000000..ff7d78d20835c605dc581ef14ad2d7d5171fea1d --- /dev/null +++ b/python/tools/mdl2fluid/op_types.py @@ -0,0 +1,123 @@ +# coding=utf-8 + +# mdl layers +layer_mdl_conv = 'ConvolutionLayer' +layer_mdl_deepwise_conv = 'DepthwiseConvolutionLayer' +layer_mdl_relu = 'ReluLayer' +layer_mdl_pointwise_add = 'PointwiseConvolutionLayer' + +# fluid ops +op_fluid_fusion_conv_add = 'fusion_conv_add' +op_fluid_relu = 'relu' + +# dict mdk layer --- fluid op +mdl2fluid_op_layer_dict = { + layer_mdl_conv: op_fluid_fusion_conv_add, + layer_mdl_deepwise_conv: op_fluid_fusion_conv_add, + layer_mdl_relu: op_fluid_relu, + layer_mdl_pointwise_add: op_fluid_fusion_conv_add +} + +mdl_outputs_key = "outputs" +mdl_inputs_key = "inputs" +mdl_weight_key = "weights" +mdl_attrs_key = "params" + +# dict of mdl-input _out param to fluid input out attrs +fusion_conv_add_dict = { + mdl_inputs_key: 'Input', + mdl_outputs_key: 'Out', + mdl_weight_key: ('Filter', 'Y'), + mdl_attrs_key: ( + # 'workspace_size_MB', 'use_mkldnn', 'use_cudnn', 'data_format','dilations', + # dilations = [1,1] + 'groups', 'paddings', 'strides' + # 'axis' + ) +} + +relu_dict = { + mdl_inputs_key: 'X', + mdl_outputs_key: 'Out', + mdl_weight_key: () + +} +# mdl layers --- fluid ops +op_io_dict = { + 'fusion_conv_add': fusion_conv_add_dict, + 'relu': relu_dict +} + +# fluid attr key --- mdl params key +fusion_conv_add_attrs_dict = { + 'paddings': 'pad', + 'strides': 'stride', + 'groups': 'group' +} +# fluid attr key --- mdl params key +fluid_attrs_type_dict = { + 'paddings': 0, + 'strides': 6, + 'groups': 6 +} + +# '': "bias_term", 是不是要add 目前 yolo的模型都是 bias_term = 1 + + +# attrs { +# name: "axis" +# type: INT +# i: 1 +# } + + +# attrs_name = { +# 'name': "workspace_size_MB", +# 'type': 'INT', +# 'i': '4096' +# } +# attrs +# { +# name: "data_format" +# type: STRING +# s: "AnyLayout" +# } +# attrs +# { +# name: "use_mkldnn" +# type: BOOLEAN +# b: false +# } +# attrs +# { +# name: "use_cudnn" +# type: BOOLEAN +# b: true +# } +# attrs +# { +# name: "dilations" +# type: INTS +# ints: 1 +# ints: 1 +# } +# attrs +# { +# name: "groups" +# type: INT +# i: 1 +# } +# attrs +# { +# name: "paddings" +# type: INTS +# ints: 0 +# ints: 0 +# } +# attrs +# { +# name: "strides" +# type: INTS +# ints: 1 +# ints: 1 +# } diff --git a/python/tools/mdl2fluid/swicher.py b/python/tools/mdl2fluid/swicher.py new file mode 100644 index 0000000000000000000000000000000000000000..bfe0360fd5b32f5e6fa61f6f05a0a384fb3a1e9b --- /dev/null +++ b/python/tools/mdl2fluid/swicher.py @@ -0,0 +1,115 @@ +from array import array + + +class Swichter: + def __init__(self): + pass + + def nhwc2nchw_one_slice(self, from_file_name, to_file_name, batch, channel, height, width): + from_file = open(from_file_name, "rb") + to_file = open(to_file_name, "wb") + + float_array = array("f") + float_array.fromfile(from_file, width * height * batch * channel) + float_write_array = array("f") + + for b in range(batch): + for c in range(channel): + for h in range(height): + for w in range(width): + float_value = float_array[b * channel * width * height + + channel * (h * width + w) + c] + + float_write_array.append(float_value) + + 
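+        # The flat source buffer is NHWC: element (b, h, w, c) sits at offset
+        # b*C*H*W + (h*W + w)*C + c, so the b, c, h, w loop order above emits NCHW.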
float_write_array.tofile(to_file) + from_file.close() + to_file.close() + + def copy(self, from_file_name, to_file_name): + from_file = open(from_file_name, "rb") + to_file = open(to_file_name, "wb") + + to_file.write(from_file.read()) + from_file.close() + to_file.close() + + def nhwc2nchw_one_slice_add_head(self, from_file_name, to_file_name, tmp_file_name, batch, channel, height, width): + from_file = open(from_file_name, "rb") + tmp_file = open(tmp_file_name, "wb+") + float_array = array("f") + float_array.fromfile(from_file, width * height * batch * channel) + float_write_array = array("f") + + for b in range(batch): + for c in range(channel): + for h in range(height): + for w in range(width): + float_value = float_array[b * channel * width * height + + channel * (h * width + w) + c] + + float_write_array.append(float_value) + + float_write_array.tofile(tmp_file) + tmp_file.close() + from_file.close() + + tmp_file = open(tmp_file_name, "rb") + to_file = open(to_file_name, "wb") + + tmp = tmp_file.read() + head = self.read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases') + to_file.write(head) + to_file.write(tmp) + tmp_file.close() + to_file.close() + + def read_head(self, head_file): + from_file = open(head_file, "rb") + read = from_file.read(24) + # print read + from_file.close() + # print read + return read + + def copy_add_head(self, from_file_name, to_file_name, tmp_file_name): + from_file = open(from_file_name, "rb") + to_file = open(to_file_name, "wb") + # tmp_file = open(tmp_file_name, "wb") + + head = self.read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases') + to_file.write(head) + to_file.write(from_file.read()) + from_file.close() + to_file.close() + pass + + def copy_padding_add_head(self, from_file_name, to_file_name, tmp_file_name, padding): + print'padding = %d' % padding + from_file = open(from_file_name, "rb") + # print len(from_file.read()) + from_file.seek(padding, 0) + + read = from_file.read() + print len(read) + + to_file = open(to_file_name, "wb") + # tmp_file = open(tmp_file_name, "wb") + + head = self.read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases') + to_file.write(head) + to_file.write(read) + from_file.close() + to_file.close() + pass + +# Swichter().nhwc2nchw_one_slice_add_head( +# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/conv1_0.bin', +# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw_with_head/conv1_0', +# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/.tmp', +# 32, +# 3, 3, 3) + +# Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases') + +# Swichter().copy_add_head('datas/model.0.0.weight', 'datas/conv1_0', '') diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp index ec91946d95be9b4e4384606fd67a69c552166a5e..035797299be25ab54bda5c503bcf3efb1be945aa 100644 --- a/src/fpga/api.cpp +++ b/src/fpga/api.cpp @@ -12,28 +12,30 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "api.h" +#include "fpga/api.h" #include -#include -#include #include +#include #include -#include -#include "bias_scale.h" -#include "filter.h" -#include "image.h" - +#include +#include "fpga/bias_scale.h" +#include "fpga/filter.h" +#include "fpga/image.h" #define FPGA_TEST_MODE +#define PADDLE_MOBILE_OS_LINUX namespace paddle_mobile { namespace fpga { static int fd = -1; static const char *device_path = "/dev/fpgadrv0"; +static std::map memory_map; static inline int do_ioctl(int req, const void *arg) { #ifdef PADDLE_MOBILE_OS_LINUX - return ioctl(req, (unsigned int64_t)arg); + int result = ioctl(fd, req, (uint64_t)arg); + PADDLE_MOBILE_ENFORCE(result == 0, "ioctl didn't return correctly"); + return result; #else return -1; #endif @@ -48,50 +50,94 @@ int open_device() { // memory management; void *fpga_malloc(size_t size) { + static uint64_t counter = 0; + #ifdef PADDLE_MOBILE_OS_LINUX - return reinterpret_cast( - mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)); + auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); #else - return malloc(size); + auto ptr = malloc(size); #endif + counter += size; + memory_map.insert(std::make_pair(ptr, size)); + DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total " + << counter << " bytes"; + return ptr; } void fpga_free(void *ptr) { + static uint64_t counter = 0; + size_t size = 0; + + auto iter = memory_map.find(ptr); // std::map::iterator + if (iter != memory_map.end()) { + size = iter->second; + memory_map.erase(iter); #ifdef PADDLE_MOBILE_OS_LINUX - munmap(ptr, 0); + munmap(ptr, size); #else - free(ptr); + free(ptr); #endif + counter += size; + DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total " + << counter << " bytes"; + } else { + DLOG << "Invalid pointer"; + } } void fpga_copy(void *dest, const void *src, size_t num) { memcpy(dest, src, num); } +int fpga_flush(void *address, size_t size) { + struct MemoryCacheArgs args = {nullptr}; + args.address = address; + args.size = size; + return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); +} + +int fpga_invalidate(void *address, size_t size) { + struct MemoryCacheArgs args = {nullptr}; + args.address = address; + args.size = size; + return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); +} + +int ComputeBasicConv(const struct ConvArgs &args) { + DLOG << "======Compute Basic Conv======"; + DLOG << " relu_enabled:" << args.relu_enabled + << " sb_address:" << args.sb_address + << " filter_address:" << args.filter_address + << " filter_num:" << args.filter_num + << " group_num:" << args.group_num; + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; + + return do_ioctl(IOCTL_CONFIG_CONV, &args); +} + int ComputeFpgaConv(const struct WrapperConvArgs &args) { #ifdef FPGA_TEST_MODE -/*DLOG << " relu_enabled:" << args.relu_enabled - << " sb_address:" << args.sb_address - << " filter_address:" << args.filter_address - << " filter_num:" << args.filter_num - << " group_num:" << args.group_num; 
-DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; -DLOG << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; -DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address;*/ + DLOG << "=============ComputeFPGAConv==========="; + DLOG << " filter_num:" << args.filter_num + << " group_num:" << args.group_num + << " split_num:" << args.split_num; #endif + int split_num = args.split_num; for (int i = 0; i < split_num; i++) { - do_ioctl(IOCTL_CONFIG_CONV, &args.conv_args[i]); + ComputeBasicConv(args.conv_args[i]); } if (split_num > 1) { @@ -101,6 +147,7 @@ DLOG << " out_address:" << args.output.address int ComputeFpgaPool(const struct PoolingArgs &args) { #ifdef FPGA_TEST_MODE + DLOG << "=============ComputeFpgaPool==========="; DLOG << " image_address:" << args.image.address << " image_scale_address:" << args.image.scale_address << " image_channels:" << args.image.channels @@ -121,6 +168,7 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { int ComputeFpgaEWAdd(const struct EWAddArgs &args) { #ifdef FPGA_TEST_MODE + DLOG << "=============ComputeFpgaEWAdd==========="; DLOG << " relu_enabled:" << args.relu_enabled << " const0:" << args.const0 << " const1:" << args.const1; DLOG << " image0_address:" << args.image0.address @@ -145,8 +193,11 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { } int PerformBypass(const struct BypassArgs &args) { #ifdef FPGA_TEST_MODE - DLOG << " layout_type:" << args.layout_type - << " convert_type:" << args.convert_type; + DLOG << "=============ComputeFpgaBypass==========="; + DLOG << " input_type:" << args.input_data_type + << " output_type:" << args.output_data_type + << " input_layout_type:" << args.input_layout_type + << " output_layout_type:" << args.output_layout_type; DLOG << " image_address:" << args.image.address << " image_scale_address:" << args.image.scale_address << " image_channels:" << args.image.channels @@ -162,29 +213,71 @@ int PerformBypass(const struct BypassArgs &args) { } int ComputeFPGAConcat(const struct ConcatArgs &args) { +#ifdef FPGA_TEST_MODE + DLOG << "=============ComputeFpgaConcat==========="; + DLOG << " Image_num: " << args.image_num + << " out_address:" << args.image_out + << " out_scale_address:" << args.scale_out; + DLOG << " image_height:" << args.height << " image_width:" << args.width; + for (int i = 0; i < args.image_num; i++) { + DLOG << " " << i << "th: "; + DLOG << " channel_num:" << args.channel_num[i] + << " image_address:" << args.images_in[i] + << " image_scale_address:" << args.scales_in[i]; + } +#endif + image::concat_images(args.images_in, args.scales_in, args.image_out, args.scale_out, args.image_num, args.channel_num, args.height, args.width); return 0; } +int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); } + void format_image(framework::Tensor *image_tensor) { auto dims = image_tensor->dims(); auto channel = dims[1], height = dims[2], width = dims[3]; - auto data_ptr = image_tensor->mutable_data(); + auto data_ptr = image_tensor->data(); size_t memory_size = channel * height * width * sizeof(float); - float *new_data = (float *)fpga_malloc(memory_size); 
+ auto new_data = (float *)fpga_malloc(memory_size); // NOLINT fpga_copy(new_data, data_ptr, memory_size); image::format_image(&new_data, channel, height, width); image_tensor->reset_data_ptr(new_data); } -void format_ofm(framework::Tensor *ofm_tensor) { +void format_fp16_ofm(framework::Tensor *ofm_tensor) { auto dims = ofm_tensor->dims(); - auto channel = dims[1], height = dims[2], width = dims[3]; - size_t memory_size = - height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half); - ofm_tensor->reset_data_ptr(fpga_malloc(memory_size)); + size_t memory_size = 0; + if (dims.size() == 4) { + auto channel = dims[1], height = dims[2], width = dims[3]; + memory_size = + height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half); + } else if (dims.size() == 2) { + memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half); + } else { + DLOG << "Wrong ofm dimension"; + } + auto p = fpga_malloc(memory_size); + memset(p, 0, memory_size); + ofm_tensor->reset_data_ptr(p); +} + +void format_fp32_ofm(framework::Tensor *ofm_tensor) { + auto dims = ofm_tensor->dims(); + size_t memory_size = 0; + if (dims.size() == 4) { + auto channel = dims[1], height = dims[2], width = dims[3]; + memory_size = + height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(float); + } else if (dims.size() == 2) { + memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(float); + } else { + DLOG << "Wrong ofm dimension"; + } + auto p = fpga_malloc(memory_size); + memset(p, 0, memory_size); + ofm_tensor->reset_data_ptr(p); } float filter_find_max(framework::Tensor *filter_tensor) { @@ -200,7 +293,7 @@ int get_plit_num(framework::Tensor *filter_tensor) { return filter::calc_split_num(num, div_capacity); } -int get_element_num_per_div(framework::Tensor *filter_tensor, int group_num) { +int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) { auto dims = filter_tensor->dims(); auto chw = dims[1] * dims[2] * dims[3]; auto num = dims[0]; @@ -218,13 +311,13 @@ int get_aligned_filter_num(int num) { void format_filter(framework::Tensor *filter_tensor, float max_value, int group_num) { - filter_tensor->scale[0] = float(max_value / 127.0); - filter_tensor->scale[1] = float(127.0 / max_value); + filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT + filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT auto dims = filter_tensor->dims(); auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; - auto data_ptr = filter_tensor->mutable_data(); + auto data_ptr = filter_tensor->data(); size_t memory_size = num * channel * height * width * sizeof(float); - auto new_data = (float *)fpga_malloc(memory_size); + auto new_data = (float *)fpga_malloc(memory_size); // NOLINT fpga_copy(new_data, data_ptr, memory_size); filter::format_filter(&new_data, num, channel, height, width, group_num, max_value); @@ -246,7 +339,7 @@ void format_concat_output(framework::Tensor *out, int height, int width, sum_cw = align_to_x(width * sum_channel, IMAGE_ALIGNMENT); auto data_ptr = fpga_malloc(height * sum_cw * sizeof(half)); - auto ddim = framework::make_ddim({-1, sum_channel, height, width}); + auto ddim = framework::make_ddim({1, sum_channel, height, width}); out->Resize(ddim); out->reset_data_ptr(data_ptr); } @@ -257,15 +350,16 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, int padding_h, int padding_w, float *bs_ptr) { auto input_ptr = input->data(); auto filter_ptr = filter->data(); - auto out_ptr = out->mutable_data(); + auto out_ptr = 
out->data(); arg->group_num = (uint32_t)group_num; - arg->split_num = (uint32_t)fpga::get_plit_num(filter); + // Either group_num or split_num = 1; + arg->split_num = group_num == 1 ? (uint32_t)get_plit_num(filter) : 1; arg->filter_num = (uint32_t)filter->dims()[0]; arg->output.address = out_ptr; arg->output.scale_address = out->scale; - arg->conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(arg->split_num * - sizeof(fpga::ConvArgs)); + arg->conv_args = + (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT arg->concat_arg.image_num = arg->split_num; arg->concat_arg.image_out = out_ptr; @@ -274,15 +368,17 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, arg->concat_arg.width = (uint32_t)filter->dims()[3]; int n = arg->split_num; - arg->concat_arg.images_in = (half **)fpga::fpga_malloc(n * sizeof(int *)); - arg->concat_arg.scales_in = (float **)fpga::fpga_malloc(n * sizeof(float *)); + arg->concat_arg.images_in = + (half **)fpga_malloc(n * sizeof(int *)); // NOLINT + arg->concat_arg.scales_in = + (float **)fpga_malloc(n * sizeof(float *)); // NOLINT arg->concat_arg.channel_num = - (uint32_t *)fpga::fpga_malloc(n * sizeof(uint32_t)); + (uint32_t *)fpga_malloc(n * sizeof(uint32_t)); // NOLINT arg->concat_arg.image_out = out_ptr; - const int channel = (int)out->dims()[1]; - int element_num_per_div = fpga::get_element_num_per_div(filter, group_num); - int element_num = fpga::get_aligned_filter_element_num( + auto channel = (int)out->dims()[1]; // NOLINT + int filter_num_per_div = get_filter_num_per_div(filter, group_num); + int element_num = get_aligned_filter_element_num( filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); for (int i = 0; i < n; i++) { @@ -299,28 +395,30 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, arg->conv_args[i].image.scale_address = input->scale; arg->conv_args[i].image.pad_height = (uint32_t)padding_h; arg->conv_args[i].image.pad_width = (uint32_t)padding_w; - arg->conv_args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num]; - arg->conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; - arg->conv_args[i].filter_num = - (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num( - channel - (n - 1) * element_num_per_div) - : element_num_per_div); + arg->conv_args[i].filter_scale_address = filter->scale; + arg->conv_args[i].filter_address = &( + (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT + arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; + arg->conv_args[i].filter_num = (uint32_t)( + i == n - 1 ? 
channel - (n - 1) * filter_num_per_div // NOLINT + : filter_num_per_div); if (n > 1) { arg->conv_args[i].output.scale_address = - (float *)fpga::fpga_malloc(2 * sizeof(float)); - arg->conv_args[i].output.address = - fpga::fpga_malloc(input->dims()[2] * input->dims()[3] * - arg->conv_args[i].filter_num * sizeof(half)); - } - - else { + (float *)fpga_malloc(2 * sizeof(float)); // NOLINT + arg->conv_args[i].output.address = fpga_malloc( + input->dims()[2] * + align_to_x(input->dims()[3] * arg->conv_args[i].filter_num, + IMAGE_ALIGNMENT) * + sizeof(half)); + } else { arg->conv_args[i].output.scale_address = out->scale; arg->conv_args[i].output.address = out_ptr; } - arg->concat_arg.images_in[i] = (half *)arg->conv_args[i].output.address; - arg->concat_arg.scales_in[i] = (float *)arg->conv_args[i].sb_address; + arg->concat_arg.images_in[i] = + (half *)arg->conv_args[i].output.address; // NOLINT + arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address; arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num; } } diff --git a/src/fpga/api.h b/src/fpga/api.h index 096f847170501784f0ee74b5a98ca91349587cfc..f5fa05b6750996ee391a30d2651a69d90e357547 100644 --- a/src/fpga/api.h +++ b/src/fpga/api.h @@ -20,28 +20,17 @@ limitations under the License. */ #include #include "framework/tensor.h" -// memory management; - namespace paddle_mobile { namespace fpga { -int open_device(); -int close_device(); - -void* fpga_malloc(size_t size); -void fpga_free(void* ptr); -void fpga_copy(void* dst, const void* src, size_t num); - -enum DataConvertType { - DATA_NO_CONVERT = 0, - DATA_FP32_TO_FP16 = 1, - DATA_FP16_TO_FP32 = 2, +enum DataType { + DATA_TYPE_FP32 = 1, + DATA_TYPE_FP16 = 0, }; -enum LayoutConvertType { - LAYOUT_NO_CONVERT = 0, - LAYOUT_CHW_TO_HWC = 1, - LAYOUT_HWC_TO_CHW = 2, +enum LayoutType { + LAYOUT_CHW = 1, + LAYOUT_HWC = 0, }; struct VersionArgs { @@ -54,9 +43,6 @@ struct MemoryCopyArgs { size_t size; }; -/** -Conv and Pooling kernel -*/ struct KernelArgs { uint32_t width; uint32_t height; @@ -118,20 +104,21 @@ struct PoolingArgs { struct ImageOutputArgs output; }; -// elementwise add arguments struct EWAddArgs { bool relu_enabled; - float const0; // output0 = const0 x input0 + const1 x input1; - float const1; + uint32_t const0; // output0 = const0 x input0 + const1 x input1; + uint32_t const1; struct ImageInputArgs image0; struct ImageInputArgs image1; struct ImageOutputArgs output; }; struct BypassArgs { - enum DataConvertType convert_type; - enum LayoutConvertType layout_type; + enum DataType input_data_type; + enum DataType output_data_type; + enum LayoutType input_layout_type; + enum LayoutType output_layout_type; struct ImageInputArgs image; struct ImageOutputArgs output; }; @@ -141,6 +128,16 @@ struct FpgaRegWriteArgs { uint64_t value; }; +struct FpgaRegReadArgs { + uint64_t address; + uint64_t value; +}; + +struct MemoryCacheArgs { + void* address; + size_t size; +}; + #define IOCTL_FPGA_MAGIC 'FPGA' #define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs) @@ -148,6 +145,8 @@ struct FpgaRegWriteArgs { #define IOCTL_SEPARATOR_0 10 #define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs) +#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) +#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) #define IOCTL_SEPARATOR_1 20 @@ -184,6 +183,15 @@ enum FPGA_ERR_TYPE { //============================== API ============================= +int open_device(); +int close_device(); + +void* 
fpga_malloc(size_t size); +void fpga_free(void* ptr); +void fpga_copy(void* dst, const void* src, size_t num); +int fpga_flush(void* address, size_t size); +int fpga_invalidate(void* address, size_t size); + int PerformBypass(const struct BypassArgs& args); int ComputeFpgaConv(const struct WrapperConvArgs& args); int ComputeFpgaPool(const struct PoolingArgs& args); @@ -192,11 +200,13 @@ int ComputeFPGAConcat(const struct ConcatArgs& args); static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } +int get_align_image_cw(int cw); void format_image(framework::Tensor* image_tensor); -void format_ofm(framework::Tensor* ofm_tensor); // only allocate memory +void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory +void format_fp32_ofm(framework::Tensor* ofm_tensor); float filter_find_max(framework::Tensor* filter_tensor); -int get_element_num_per_div(framework::Tensor* filter_tensor, int group_num); +int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num); int get_plit_num(framework::Tensor* filter_tensor); int get_aligned_filter_element_num(int chw); int get_aligned_filter_num(int num); diff --git a/src/fpga/bias_scale.cpp b/src/fpga/bias_scale.cpp index a1b0c8577b9100f69f823a39e9e136c46b7e09ff..50f1ed03f0121b5afdc41d427e5b52675994bd1e 100644 --- a/src/fpga/bias_scale.cpp +++ b/src/fpga/bias_scale.cpp @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "bias_scale.h" +#include "fpga/bias_scale.h" #include -#include "api.h" +#include "fpga/api.h" namespace paddle_mobile { namespace fpga { @@ -29,7 +29,8 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) { align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT); int num_element = 2 * div_num * num_per_div_after_alignment; // including bias & scale - float *ptr_aligned = (float *)fpga_malloc(num_element * sizeof(float)); + float *ptr_aligned = + (float *)fpga_malloc(num_element * sizeof(float)); // NOLINT memset(ptr_aligned, 0, num_element * sizeof(float)); @@ -59,7 +60,7 @@ void interleave(float **data_in, int num_after_alignment) { float *ptr_uninterleaved = *data_in; float *ptr_interleaved = - (float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); + (float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT int num = num_after_alignment / 4; for (int i = 0; i < num; i++) { memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i, @@ -79,6 +80,7 @@ void format_bias_scale_array(float **bias_scale_array, int element_num_after_division = align_to_x(element_num_per_division, BS_NUM_ALIGNMENT); interleave(bias_scale_array, div_num * element_num_after_division); + fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float)); } } // namespace bias_scale diff --git a/src/fpga/filter.cpp b/src/fpga/filter.cpp index 5f1a16d2339f3859f4cd85408c965d8d2634a55f..b4678c4cdcfe27b17bf819248f0c91061160d214 100644 --- a/src/fpga/filter.cpp +++ b/src/fpga/filter.cpp @@ -11,9 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
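Note: align_to_x(num, x) = (num + x - 1) / x * x rounds num up to the next multiple of x; it drives all of the buffer sizing above (IMAGE_ALIGNMENT, FILTER_ELEMENT_ALIGNMENT, BS_NUM_ALIGNMENT). A minimal standalone check, using 16 only as an example alignment value (the actual alignment constants are defined elsewhere in the tree):

#include <cassert>

static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }

int main() {
  assert(align_to_x(3 * 224, 16) == 672);  // already a multiple of 16, unchanged
  assert(align_to_x(100, 16) == 112);      // rounded up to the next multiple
  assert(align_to_x(16, 16) == 16);        // exact multiples are left as-is
  return 0;
}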
*/ -#include "filter.h" + +#include "fpga/filter.h" #include -#include "api.h" +#include +#include "fpga/api.h" namespace paddle_mobile { namespace fpga { @@ -55,7 +57,7 @@ void convert_to_hwc(char **data_in, int num, int channel, int height, int width) { char *tmp = *data_in; int chw = channel * height * width; - char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); + char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT for (int n = 0; n < num; n++) { int64_t amount_per_row = width * channel; for (int c = 0; c < channel; c++) { @@ -88,11 +90,11 @@ void quantize(float **data_in, int data_size, float max) { float fix_range = 127; float scale = fix_range / max; - char *tmp_data = (char *)fpga_malloc(data_size * sizeof(char)); + char *tmp_data = (char *)fpga_malloc(data_size * sizeof(char)); // NOLINT for (int i = 0; i < data_size; i++) { - tmp_data[i] = (char)((*data_in)[i] * scale); + tmp_data[i] = (char)((*data_in)[i] * scale); // NOLINT } - *data_in = (float *)tmp_data; + *data_in = (float *)tmp_data; // NOLINT fpga_free(tmp); } @@ -101,9 +103,9 @@ void align_element(char **data_in, int num, int chw) { int j = 0; int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); if (align_chw != chw) { - printf("align %d \n", align_chw); char *tmp = *data_in; - char *data_tmp = (char *)fpga_malloc(num * align_chw * sizeof(char)); + char *data_tmp = + (char *)fpga_malloc(num * align_chw * sizeof(char)); // NOLINT memset(data_tmp, 0, num * align_chw); for (j = 0; j < num; j++) { @@ -125,7 +127,7 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num, int div_num = (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; int num_element = div_num * num_per_div_after_alignment * align_chw; - char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); + char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT memset(data_tmp, 0, num_element * sizeof(char)); @@ -147,7 +149,8 @@ void reorder(char **data_in, int num_after_alignment, int chw) { int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); char *data_tmp = - (char *)fpga_malloc(chw_align * num_after_alignment * sizeof(char)); + (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT + sizeof(char)); char *tmp = *data_in; for (index = 0; index < num_after_alignment; index++) { new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) + @@ -164,10 +167,11 @@ void interleave(char **data_in, int num_after_alignment, int chw) { int j = 0; int k = 0; int interleave_per_num = 16; - ; + int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); char *data_tmp = - (char *)fpga_malloc(chw_align * num_after_alignment * sizeof(char)); + (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT + sizeof(char)); char *tmp = *data_in; int interleave_num = chw_align * 2 / interleave_per_num; for (i = 0; i < num_after_alignment; i += 2) { @@ -200,13 +204,15 @@ void format_filter(float **data_in, int num, int channel, int height, int width, quantize(data_in, data_size, max); - char **quantize_data = (char **)data_in; + char **quantize_data = (char **)data_in; // NOLINT convert_to_hwc(quantize_data, num, channel, height, width); align_element(quantize_data, num, chw); align_num(quantize_data, num_per_div_before_alignment, num, chw); reorder(quantize_data, num_after_alignment, chw); interleave(quantize_data, num_after_alignment, chw); + fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * + num_after_alignment * sizeof(char)); 
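Note: the filter path above quantizes float32 weights to int8 with scale = 127 / max and records both max/127.0 and 127.0/max in the tensor's scale[2] (see format_filter in fpga/api.cpp and quantize in fpga/filter.cpp). A minimal standalone sketch of that quantize step, with hypothetical names and no FPGA buffers involved:

#include <cstdint>
#include <vector>

// Symmetric int8 quantization as used for FPGA filters: q = f * (127 / max).
// scale[0] recovers floats from int8, scale[1] quantizes floats to int8.
void quantize_filter(const std::vector<float>& in, float max_abs,
                     std::vector<int8_t>* out, float scale[2]) {
  scale[0] = max_abs / 127.0f;
  scale[1] = 127.0f / max_abs;
  out->resize(in.size());
  for (size_t i = 0; i < in.size(); ++i)
    (*out)[i] = static_cast<int8_t>(in[i] * scale[1]);  // truncation, as in filter.cpp
}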
} } // namespace filter diff --git a/src/fpga/image.cpp b/src/fpga/image.cpp index 872abcd7c2dd6b16ab8ec8077e9afa6ec60c10d4..dac6e2a633155e593550ede4d738c5606cec3283 100644 --- a/src/fpga/image.cpp +++ b/src/fpga/image.cpp @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "image.h" +#include "fpga/image.h" #include -#include "api.h" +#include +#include "fpga/api.h" namespace paddle_mobile { namespace fpga { @@ -23,7 +24,7 @@ namespace image { void convert_to_hwc(float **data_in, int channel, int height, int width) { float *tmp = *data_in; float *data_tmp = - (float *)fpga_malloc(channel * height * width * sizeof(float)); + (float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT int64_t amount_per_row = width * channel; for (int c = 0; c < channel; c++) { for (int h = 0; h < height; h++) { @@ -38,17 +39,18 @@ void convert_to_hwc(float **data_in, int channel, int height, int width) { } void align_element_conv(float **data_in, int height, int cw) { - int i = 0; int h = 0; int align_cw = align_to_x(cw, IMAGE_ALIGNMENT); if (align_cw != cw) { float *tmp = *data_in; - float *data_tmp = (float *)fpga_malloc(height * align_cw * sizeof(float)); + float *data_tmp = + (float *)fpga_malloc(height * align_cw * sizeof(float)); // NOLINT memset(data_tmp, 0, height * align_cw * sizeof(float)); for (h = 0; h < height; h++) { - memcpy((void *)(data_tmp + h * align_cw), (void *)(*data_in + h * cw), + memcpy((void *)(data_tmp + h * align_cw), // NOLINT + (void *)(*data_in + h * cw), // NOLINT cw * sizeof(float)); } @@ -60,6 +62,8 @@ void align_element_conv(float **data_in, int height, int cw) { void format_image(float **data_in, int channel, int height, int width) { convert_to_hwc(data_in, channel, height, width); align_element_conv(data_in, height, channel * width); + fpga_flush(*data_in, align_to_x(channel * width, IMAGE_ALIGNMENT) * height * + sizeof(float)); } void concat_images(int16_t **images_in, float **scales_in, void *image_out, @@ -73,11 +77,17 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, int align_each_in_area_cw = 0; int align_each_out_area_cw_differ = 0; int tmp_channel = 0; - *scale_out = 0; + scale_out[0] = 0.0; + scale_out[1] = 0.0; for (i = 0; i < image_num; i++) { each_out_line_channel += channel_num[i]; - *scale_out = std::max(*scale_out, scales_in[i][0]); + scale_out[0] = std::max(*scale_out, scales_in[i][0]); + fpga_invalidate(images_in[i], + height * + align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) * + sizeof(int16_t)); } + scale_out[1] = 1 / scale_out[0]; align_each_out_area_cw = align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT); align_each_out_area_cw_differ = @@ -88,7 +98,7 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, for (i = 0; i < image_num; i++) { align_each_in_area_cw = align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT); - memcpy((int16_t *)image_out + tmp_channel + + memcpy((int16_t *)image_out + tmp_channel + // NOLINT k * align_each_out_area_cw_differ, images_in[i] + j * channel_num[i] + k * align_each_in_area_cw, channel_num[i] * sizeof(int16_t)); @@ -97,6 +107,8 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, } } } + + fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t)); } } // namespace image diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index 
7c66f932df3df9793f116c8e62fea704e346b146..dd865fb27d4345f16ddca8005463986787d681be 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -62,13 +62,24 @@ void OperatorBase::Run() const { DLOG << "-------------" << type_ << "----------------------------"; vector input_keys = GetInputKeys(); for (const auto key : input_keys) { - Tensor *input = GetVarValue(key, inputs_, *scope_); - if (input) DLOG << type_ << " input- " << key << "=" << *input; + auto var_vec_in = inputs_.at(key); + for (int i = 0; i < var_vec_in.size(); ++i) { + auto vari = scope_->FindVar(var_vec_in[i]); + if (vari->IsInitialized()) { + Tensor *tensor = vari->template GetMutable(); + if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; + } + } } - vector output_keys = GetOutKeys(); - for (const auto key : output_keys) { - Tensor *out_ = GetVarValue(key, outputs_, *scope_); - DLOG << type_ << " output- " << key << "=" << *out_; + for (const auto key : GetOutKeys()) { + auto var_vec_out = outputs_.at(key); + for (int i = 0; i < var_vec_out.size(); ++i) { + auto vari = scope_->FindVar(var_vec_out[i]); + if (vari->IsInitialized()) { + Tensor *tensor = vari->template GetMutable(); + if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor; + } + } } #endif } diff --git a/src/framework/tensor.h b/src/framework/tensor.h index c5572dcbfdbd665994be7ebe005b6c9c98b5bca9..86dad5cdd216c1db604eeec04d48cb94cd2c336b 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -289,12 +289,8 @@ class Tensor { virtual std::type_index type() const { return type_; } virtual void set_type(std::type_index type) { type_ = type; } -#ifndef PADDLE_MOBILE_FPGA - /*! the pointer of memory block. */ + std::unique_ptr> ptr_; -#else - std::shared_ptr ptr_; -#endif /*! the size of memory block. */ size_t size_; @@ -323,10 +319,11 @@ class Tensor { * begins. */ size_t offset_; + #ifdef PADDLE_MOBILE_FPGA - public: + public: // NOLINT inline void reset_data_ptr(void *p) { - ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); + ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); // NOLINT } float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX #endif diff --git a/src/io/loader.cpp b/src/io/loader.cpp index 1cef0ad2fdd6bc9f1e0351ed02778f3a1c322677..7dd55950be240a88a7521d4be260416625419015 100644 --- a/src/io/loader.cpp +++ b/src/io/loader.cpp @@ -27,8 +27,8 @@ using framework::Variable; * @param scope */ void InitMemoryFromProgram( - std::shared_ptr &originProgramDesc, - std::shared_ptr &scope) { + std::shared_ptr &originProgramDesc, // NOLINT + std::shared_ptr &scope) { // NOLINT for (const auto &block : originProgramDesc.get()->Blocks()) { for (const auto &var_desc : block->Vars()) { auto var = scope.get()->Var(var_desc->Name()); @@ -45,7 +45,6 @@ void InitMemoryFromProgram( tensor->Resize(framework::make_ddim(dim)); } } else { - // var_desc type is always lod tensor in any time?? (houjiang) // TODO(codeWorm): some. 
} } @@ -62,7 +61,8 @@ void InitMemoryFromProgram( */ template void FusionAndPrintInfos( - bool &optimize, bool &can_add_split, framework::Program &program, + bool optimize, bool can_add_split, + framework::Program &program, // NOLINT const std::shared_ptr &originProgramDesc) { if (optimize) { framework::ProgramOptimize program_optimize; @@ -78,6 +78,7 @@ void FusionAndPrintInfos( originProgramDesc->Description("program: "); } } + static size_t ReadBuffer(const char *file_name, uint8_t **out) { FILE *fp; fp = fopen(file_name, "rb"); diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp index 275e850caa2fb8da494cdfde5acf24b45e1b40ec..ed0213f15990c98d5868b77962c0f805283e5bdc 100644 --- a/src/io/paddle_mobile.cpp +++ b/src/io/paddle_mobile.cpp @@ -21,13 +21,11 @@ void PaddleMobile::SetThreadNum(int num) { #ifdef _OPENMP omp_set_num_threads(num); #endif -}; +} template -bool PaddleMobile::Load(const std::string &dirname, - bool optimize, - bool quantification, - bool loddable) { +bool PaddleMobile::Load(const std::string &dirname, bool optimize, + bool quantification, bool loddable) { if (loader_.get() == nullptr) { loader_ = std::make_shared>(); } else { @@ -46,10 +44,8 @@ bool PaddleMobile::Load(const std::string &dirname, template bool PaddleMobile::Load(const std::string &model_path, - const std::string ¶_path, - bool optimize, - bool quantification, - bool loddable) { + const std::string ¶_path, bool optimize, + bool quantification, bool loddable) { if (loader_.get() == nullptr) { loader_ = std::make_shared>(); } else { @@ -84,7 +80,8 @@ bool PaddleMobile::LoadCombinedMemory( executor_ = std::make_shared>( loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len, combined_params_buf, optimise, - quantification), optimise); + quantification), + optimise); } else { LOG(kLOG_INFO) << "executor inited"; } @@ -122,6 +119,40 @@ PaddleMobile::~PaddleMobile() { loader_ = nullptr; } +#ifdef PADDLE_MOBILE_FPGA + +template +void PaddleMobile::InjectVariable(const framework::Tensor &t, + string var_name) { + executor_->InjectVariable(t, var_name); +} + +template +void PaddleMobile::FeedData(const framework::Tensor &t) { + executor_->FeedData(t); +} + +template +std::shared_ptr PaddleMobile::FetchResult(int id) { + return executor_->FetchResult(id); +} + +template +void PaddleMobile::Predict_From_To(int start, int end) { + executor_->Predict_From_To(start, end); +} + +template +void PaddleMobile::Predict_From(int start) { + executor_->Predict_From(start); +} + +template +void PaddleMobile::Predict_To(int end) { + executor_->Predict_To(end); +} +#endif + template class PaddleMobile; template class PaddleMobile; template class PaddleMobile; diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h index 8abd186752170326552454a4bd3478e34b0256f0..7ed23adbf9573fb8dd3d7f124bdb9ad85c7bbe26 100644 --- a/src/io/paddle_mobile.h +++ b/src/io/paddle_mobile.h @@ -34,15 +34,11 @@ class PaddleMobile { public: PaddleMobile() {} - bool Load(const std::string &dirname, - bool optimize = false, - bool quantification = false, - bool loddable = false); + bool Load(const std::string &dirname, bool optimize = false, + bool quantification = false, bool loddable = false); - bool Load(const std::string &model_path, - const std::string ¶_path, - bool optimize = false, - bool quantification = false, + bool Load(const std::string &model_path, const std::string ¶_path, + bool optimize = false, bool quantification = false, bool loddable = false); std::shared_ptr Predict(const framework::Tensor &t); @@ 
-61,6 +57,15 @@ class PaddleMobile { ~PaddleMobile(); +#ifdef PADDLE_MOBILE_FPGA + void InjectVariable(const framework::Tensor &t, string var_name); + void FeedData(const framework::Tensor &t); + std::shared_ptr FetchResult(int id = -1); + void Predict_From_To(int start = 0, int end = -1); + void Predict_From(int start); + void Predict_To(int end); +#endif + private: std::shared_ptr> loader_; std::shared_ptr> executor_; diff --git a/src/ios_io/PaddleMobileCPU.h b/src/ios_io/PaddleMobileCPU.h new file mode 100644 index 0000000000000000000000000000000000000000..c68d81f328f4ce9a9bf16624f677b2996644c35c --- /dev/null +++ b/src/ios_io/PaddleMobileCPU.h @@ -0,0 +1,85 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#import +#import + +@interface PaddleMobileCPUResult: NSObject + +@property (assign, nonatomic, readonly) float *output; + +@property (assign, nonatomic, readonly) int outputSize; + +-(void)releaseOutput; + +@end + +@interface PaddleMobileCPU : NSObject + +/* + Create the instance +*/ +- (instancetype)init; + +/* + Load the model and allocate memory +*/ +- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath; + +/* + Load a model stored as separate files; pass in the model directory +*/ +- (BOOL)load:(NSString *)modelAndWeightPath; + +/* + * Load the model from memory + * */ +- (BOOL)LoadCombinedMemory:(size_t)modelLen + andModelBuf:(const uint8_t *)modelBuf + andModelParamsLen:(size_t)combinedParamsLen + andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf; + +/* + * Preprocess the image; the caller must allocate and release the output buffer + * */ +-(void)preprocess:(CGImageRef)image + output:(float *)output + means:(NSArray *)means + scale:(float)scale + dim:(NSArray *)dim; + +/* + * Run prediction on preprocessed data; call releaseOutput on the returned result when finished with it + * */ +- (PaddleMobileCPUResult *)predictInput:(float *)input + dim:(NSArray *)dim; + +/* + Run prediction; means and scale are the preprocessing parameters used when training the model; if no such preprocessing was done, use predict directly +*/ +- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale; + +/* + Run prediction with default means of 0 and scale of 1.0 +*/ +- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim; + +/* + Release memory +*/ +- (void)clear; + +@end diff --git a/src/ios_io/PaddleMobile.mm b/src/ios_io/PaddleMobileCPU.mm similarity index 55% rename from src/ios_io/PaddleMobile.mm rename to src/ios_io/PaddleMobileCPU.mm index 5c7b801be0ea7967ea0c94813325d41071bb890b..5a21418ef5fa9cbf7b24436cb778fc8c6c164e16 100644 --- a/src/ios_io/PaddleMobile.mm +++ b/src/ios_io/PaddleMobileCPU.mm @@ -12,24 +12,51 @@ See the License for the specific language governing permissions and limitations under the License.

*/ -#import "PaddleMobile.h" +#import "PaddleMobileCPU.h" #import "op_symbols.h" +#include "framework/tensor.h" #import "io/paddle_mobile.h" #import #import -@interface PaddleMobile() + +@interface PaddleMobileCPUResult() + +-(void)toSetOutput:(float *)output; + +-(void)toSetOutputSize:(int)outputSize; + +@end + +@implementation PaddleMobileCPUResult + +-(void)releaseOutput { + delete [] _output; + _output = nil; + _outputSize = 0; +} + +-(void)toSetOutput:(float *)output { + _output = output; +} + +-(void)toSetOutputSize:(int)outputSize { + _outputSize = outputSize; +} + +@end + + +@interface PaddleMobileCPU() { paddle_mobile::PaddleMobile *pam_; BOOL loaded_; - std::vector *predict_input_; - } @end -@implementation PaddleMobile +@implementation PaddleMobileCPU static std::mutex shared_mutex; @@ -66,6 +93,14 @@ static std::mutex shared_mutex; } } +- (BOOL)LoadCombinedMemory:(size_t)modelLen + andModelBuf:(const uint8_t *)modelBuf + andModelParamsLen:(size_t)combinedParamsLen + andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf { + pam_->SetThreadNum(2); + return loaded_ = pam_->LoadCombinedMemory(modelLen, modelBuf, combinedParamsLen, combinedParamsBuf); +} + - (BOOL)load:(NSString *)modelAndWeightPath{ std::string model_path_str = std::string([modelAndWeightPath UTF8String]); if (loaded_ = pam_->Load(model_path_str)) { @@ -75,6 +110,57 @@ static std::mutex shared_mutex; } } + +-(void)preprocess:(CGImageRef)image + output:(float *)output + means:(NSArray *)means + scale:(float)scale + dim:(NSArray *)dim { + std::lock_guard lock(shared_mutex); + + // dim to c++ vector, get numel + std::vector dim_vec; + int numel = 1; + for (int k = 0; k < dim.count; ++k) { + int d = dim[k].intValue; + numel *= d; + dim_vec.push_back(d); + } + + const int sourceRowBytes = CGImageGetBytesPerRow(image); + const int imageWidth = CGImageGetWidth(image); + const int imageHeight = CGImageGetHeight(image); + const int imageChannels = 4; + CGDataProviderRef provider = CGImageGetDataProvider(image); + CFDataRef cfData = CGDataProviderCopyData(provider); + const UInt8 *input = CFDataGetBytePtr(cfData); + + int wanted_input_width = dim_vec[3]; + int wanted_input_height = dim_vec[2]; + int wanted_input_channels = dim_vec[1]; + + for (int c = 0; c < wanted_input_channels; ++c) { + float *out_channel = output + c * wanted_input_height * wanted_input_width; + for (int y = 0; y < wanted_input_height; ++y) { + float *out_row = out_channel + y * wanted_input_width; + for (int x = 0; x < wanted_input_width; ++x) { + int in_row = (y * imageHeight) / wanted_input_height; + int in_col = (x * imageWidth) / wanted_input_width; + const UInt8 *in_pixel = input + (in_row * imageWidth * imageChannels) + (in_col * imageChannels); + float *out_pos = out_row + x; + if (c == 0) { + *out_pos = (in_pixel[c] - means[c].floatValue) * scale; + }else if (c == 1){ + *out_pos = (in_pixel[c] - means[c].floatValue) * scale; + }else if (c == 2){ + *out_pos = (in_pixel[c] - means[c].floatValue) * scale; + } + } + } + } + +} + -(void)preprocess:(const UInt8 *)input output:(float *)output imageWidth:(int)imageWidth imageHeight:(int)imageHeight imageChannels:(int)imageChannels means:(NSArray *)means scale:(float)scale dim:(std::vector)dim{ if (means == nil) { means = @[@0, @0, @0]; @@ -105,27 +191,54 @@ static std::mutex shared_mutex; } } -- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale{ -// printf(" hi i am here"); - if (predict_input_) { -// printf(" fukc -- "); -// printf(" %d \n", 
predict_input_->size()); - // dim to c++ vector, get numel - std::vector dim_vec = {1, 3, 300, 300}; -// int numel = 1; -// for (int k = 0; k < dim.count; ++k) { -// int d = dim[k].intValue; -// numel *= d; -// dim_vec.push_back(d); -// } - - - std::vector cpp_result = pam_->Predict(*predict_input_, dim_vec); +- (PaddleMobileCPUResult *)predictInput:(float *)input + dim:(NSArray *)dim { + std::lock_guard lock(shared_mutex); + if (!loaded_) { + printf("PaddleMobile doesn't be loaded yet"); + return nil; + } + + if (dim.count != 4) { + printf("dim must have 4 elements"); return nil; } -// printf(" predict one "); -// std::lock_guard lock(shared_mutex); + // dim to c++ vector, get numel + std::vector dim_vec; + int numel = 1; + for (int k = 0; k < dim.count; ++k) { + int d = dim[k].intValue; + numel *= d; + dim_vec.push_back(d); + } + + paddle_mobile::framework::Tensor input_tensor; + + paddle_mobile::framework::DDim dims = paddle_mobile::framework::make_ddim(dim_vec); + + float *input_ptr = input_tensor.mutable_data(dims); + + memcpy(input_ptr, input, + numel * sizeof(float)); + + std::shared_ptr output = pam_->Predict(input_tensor); + + float *output_pointer = new float[output->numel()]; + + memcpy(output_pointer, output->data(), + output->numel() * sizeof(float)); + + PaddleMobileCPUResult *cpuResult = [[PaddleMobileCPUResult alloc] init]; + [cpuResult toSetOutput: output_pointer]; + [cpuResult toSetOutputSize: output->numel()]; + + return cpuResult; +} + +- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale{ +// printf(" predict one "); + std::lock_guard lock(shared_mutex); if (!loaded_) { printf("PaddleMobile doesn't be loaded yet"); return nil; @@ -164,15 +277,13 @@ static std::mutex shared_mutex; } // input - std::vector *predict_input = new std::vector(); + std::vector predict_input; for (int j = 0; j < numel; ++j) { - predict_input->push_back(dataPointer[j]); + predict_input.push_back(dataPointer[j]); } - predict_input_ = predict_input; - // predict - std::vector cpp_result = pam_->Predict(*predict_input, dim_vec); + std::vector cpp_result = pam_->Predict(predict_input, dim_vec); // result long count = 0; diff --git a/src/ios_io/op_symbols.h b/src/ios_io/op_symbols.h index 0fe1137278d19ab4c9c9aaecf2db108e4a184993..1334b609d14554f77ceaa35c5479455c77d8c665 100644 --- a/src/ios_io/op_symbols.h +++ b/src/ios_io/op_symbols.h @@ -15,27 +15,48 @@ #pragma once #include "operators/batchnorm_op.h" +#include "operators/bilinear_interp_op.h" #include "operators/box_coder_op.h" #include "operators/concat_op.h" #include "operators/conv_op.h" +#include "operators/conv_transpose_op.h" +#include "operators/crf_op.h" #include "operators/depthwise_conv_op.h" +#include "operators/dequantize_op.h" #include "operators/dropout_op.h" #include "operators/elementwise_add_op.h" #include "operators/feed_op.h" #include "operators/fetch_op.h" -#include "operators/fusion_conv_add.h" +#include "operators/flatten_op.h" +#include "operators/fusion_conv_add_add_prelu_op.h" +#include "operators/fusion_conv_add_bn_op.h" #include "operators/fusion_conv_add_bn_relu_op.h" +#include "operators/fusion_conv_add_op.h" +#include "operators/fusion_conv_add_prelu_op.h" +#include "operators/fusion_conv_add_relu_op.h" +#include "operators/fusion_conv_bn_add_relu_op.h" #include "operators/fusion_conv_bn_relu_op.h" #include "operators/fusion_dwconv_bn_relu_op.h" +#include "operators/fusion_elementwise_add_relu_op.h" #include "operators/fusion_fc_op.h" +#include 
"operators/fusion_fc_relu_op.h" +#include "operators/gru_op.h" #include "operators/im2sequence_op.h" +#include "operators/lookup_op.h" #include "operators/lrn_op.h" #include "operators/mul_op.h" #include "operators/multiclass_nms_op.h" #include "operators/pool_op.h" +#include "operators/prelu_op.h" #include "operators/prior_box_op.h" +#include "operators/quantize_op.h" #include "operators/relu_op.h" #include "operators/reshape_op.h" +#include "operators/resize_op.h" +#include "operators/scale_op.h" +#include "operators/shape_op.h" #include "operators/sigmoid_op.h" +#include "operators/slice_op.h" #include "operators/softmax_op.h" +#include "operators/split_op.h" #include "operators/transpose_op.h" diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index 3d1b56ccc186b8e5699094b7f375c1cbc27edced..c7e77fcca40a3c533e442d10604c8cd9bcc1e74b 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -24,7 +24,7 @@ namespace operators { template class FeedOp : public framework::OperatorBase { public: - FeedOp(const string &type, const VariableNameMap &inputs, + FeedOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap attrs, std::shared_ptr scope) : framework::OperatorBase(type, inputs, outputs, attrs, @@ -38,25 +38,29 @@ class FeedOp : public framework::OperatorBase { } #ifdef PADDLE_MOBILE_FPGA + void Init() { Tensor *output = param_.Out(); - fpga::format_ofm(output); + fpga::format_fp16_ofm(output); } void RunImpl() const { - auto input = reinterpret_cast(param_.InputX()); + auto input = (Tensor *)const_cast(param_.InputX()); // NOLINT fpga::format_image(input); auto input_ptr = input->data(); Tensor *output = param_.Out(); - auto output_ptr = output->mutable_data(); - - fpga::BypassArgs args; - args.convert_type = fpga::DATA_FP32_TO_FP16; - args.layout_type = fpga::LAYOUT_NO_CONVERT; - args.image.address = input_ptr; - args.image.channels = input->dims()[1]; - args.image.height = input->dims()[2]; - args.image.width = input->dims()[3]; + auto output_ptr = output->data(); + + fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; + + args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; + args.image.address = (void *)input_ptr; // NOLINT + args.image.channels = (uint32_t)input->dims()[1]; + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; args.image.pad_height = 0; args.image.pad_width = 0; args.output.address = output_ptr; diff --git a/src/operators/kernel/central-arm-func/conv_add_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_arm_func.h index 643ee84529e01aebc33a144b4c7a8181ff39a1c9..d71bc235977236fbd0dd332df556ea4bd41eacf4 100644 --- a/src/operators/kernel/central-arm-func/conv_add_arm_func.h +++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h @@ -129,10 +129,13 @@ void ConvAddCompute(const FusionConvAddParam ¶m) { // param.Paddings(), // param.Filter(), param.Bias(), // param.Output(), false); - - math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), param.Output(), + if (param.Paddings()[0] == 0) { + math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(), *param.Bias(), true); - + } else { + math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), *param.Bias(), true); + } } else { ConvAddBasic(param); } diff --git 
a/src/operators/kernel/central-arm-func/prior_box_arm_func.h b/src/operators/kernel/central-arm-func/prior_box_arm_func.h index 7129996319aac7c71836d8706eb5c02300e576e6..e783c52f8184d6e09b04cd5c8210f5b89276541e 100644 --- a/src/operators/kernel/central-arm-func/prior_box_arm_func.h +++ b/src/operators/kernel/central-arm-func/prior_box_arm_func.h @@ -16,6 +16,7 @@ limitations under the License. */ #pragma once #include +#include #include namespace paddle_mobile { @@ -89,26 +90,8 @@ void PriorBoxCompute(const PriorBoxParam ¶m) { int idx = 0; for (size_t s = 0; s < min_sizes.size(); ++s) { auto min_size = min_sizes[s]; - // priors with different aspect ratios - for (float ar : aspect_ratios) { - box_width = min_size * sqrt(ar) / 2.; - box_height = min_size / sqrt(ar) / 2.; - /// box_width/2 , / img_width 为了得到feature map 相对于 - /// 原图的归一化位置的比例。 - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 0] = - (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 1] = - (center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 2] = - (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 3] = - (center_y + box_height) / img_height; - idx++; - } - if (!max_sizes.empty()) { - auto max_size = max_sizes[s]; - // square prior with size sqrt(minSize * maxSize) - box_width = box_height = sqrt(min_size * max_size) / 2.; + if (param.MinMaxAspectRatiosOrder()) { + box_width = box_height = min_size / 2.; output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 0] = (center_x - box_width) / img_width; output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 1] = @@ -118,6 +101,73 @@ void PriorBoxCompute(const PriorBoxParam ¶m) { output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 3] = (center_y + box_height) / img_height; idx++; + + if (max_sizes.size() > 0) { + auto max_size = max_sizes[s]; + // square prior with size sqrt(minSize * maxSize) + box_width = box_height = sqrt(min_size * max_size) / 2.; + output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + + 0] = (center_x - box_width) / img_width; + output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + + 1] = (center_y - box_height) / img_height; + output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + + 2] = (center_x + box_width) / img_width; + output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + + 3] = (center_y + box_height) / img_height; + idx++; + } + + // priors with different aspect ratios + for (float ar : aspect_ratios) { + if (fabs(ar - 1.) 
< 1e-6) { + continue; + } + box_width = min_size * sqrt(ar) / 2.; + box_height = min_size / sqrt(ar) / 2.; + /// box_width/2 , / img_width 为了得到feature map 相对于 + /// 原图的归一化位置的比例。 + output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + + 0] = (center_x - box_width) / img_width; + output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + + 1] = (center_y - box_height) / img_height; + output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + + 2] = (center_x + box_width) / img_width; + output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + + 3] = (center_y + box_height) / img_height; + idx++; + } + + } else { + // priors with different aspect ratios + for (float ar : aspect_ratios) { + box_width = min_size * sqrt(ar) / 2.; + box_height = min_size / sqrt(ar) / 2.; + /// box_width/2 , / img_width 为了得到feature map 相对于 + /// 原图的归一化位置的比例。 + output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + + 0] = (center_x - box_width) / img_width; + output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + + 1] = (center_y - box_height) / img_height; + output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + + 2] = (center_x + box_width) / img_width; + output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + + 3] = (center_y + box_height) / img_height; + idx++; + } + if (!max_sizes.empty()) { + auto max_size = max_sizes[s]; + // square prior with size sqrt(minSize * maxSize) + box_width = box_height = sqrt(min_size * max_size) / 2.; + output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + + 0] = (center_x - box_width) / img_width; + output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + + 1] = (center_y - box_height) / img_height; + output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + + 2] = (center_x + box_width) / img_width; + output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + + 3] = (center_y + box_height) / img_height; + idx++; + } } } } diff --git a/src/operators/kernel/fpga/concat_kernel.cpp b/src/operators/kernel/fpga/concat_kernel.cpp index ec7f04e3760f805cc51fd20c13913d13a286a96b..f61afd4a5c514ced87396313ea5d645fe830e12a 100644 --- a/src/operators/kernel/fpga/concat_kernel.cpp +++ b/src/operators/kernel/fpga/concat_kernel.cpp @@ -24,10 +24,12 @@ bool ConcatKernel::Init(ConcatParam *param) { auto inputs = param->Inputs(); auto out = param->Out(); auto image_num = inputs.size(); - auto images_in = (half **)fpga::fpga_malloc(image_num * sizeof(int *)); - auto scales_in = (float **)fpga::fpga_malloc(image_num * sizeof(float *)); + auto images_in = + (half **)fpga::fpga_malloc(image_num * sizeof(int *)); // NOLINT + auto scales_in = + (float **)fpga::fpga_malloc(image_num * sizeof(float *)); // NOLINT auto channel_num = - (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); + (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); // NOLINT auto height = inputs[0]->dims()[2]; auto width = inputs[0]->dims()[3]; @@ -36,22 +38,21 @@ bool ConcatKernel::Init(ConcatParam *param) { PADDLE_MOBILE_ENFORCE( input->dims()[2] == height && input->dims()[3] == width, "Image height & width should be unified"); - images_in[i] = (half *)input->data(); - channel_num[i] = (uint32_t)inputs[i]->dims()[1]; + images_in[i] = (half *)input->data(); // NOLINT + channel_num[i] = (uint32_t)inputs[i]->dims()[1]; // NOLINT scales_in[i] = input->scale; } - fpga::format_concat_output(out, (int)height, (int)width, (int)image_num, - channel_num); + fpga::format_concat_output(out, height, width, image_num, 
channel_num); - fpga::ConcatArgs concatArgs; - concatArgs.image_num = (uint32_t)image_num; + fpga::ConcatArgs concatArgs = {0}; + concatArgs.image_num = image_num; concatArgs.images_in = images_in; concatArgs.scales_in = scales_in; - concatArgs.image_out = (half *)out->data(); + concatArgs.image_out = (half *)out->data(); // NOLINT concatArgs.scale_out = out->scale; concatArgs.channel_num = channel_num; - concatArgs.height = (uint32_t)height; - concatArgs.width = (uint32_t)width; + concatArgs.height = height; + concatArgs.width = width; param->SetFpgaArgs(concatArgs); return true; } diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp index 84b9d6b0ddd9a1577ee37d095cabed2a8a2fe5a2..671df76967b4537d111695cdbe091b9c7de2c5a2 100644 --- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp @@ -23,7 +23,7 @@ template <> bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { bool relu_enabled = false; auto input = const_cast(param->Input()); - auto input_ptr = input->data(); + auto bias = param->Bias(); auto bias_ptr = bias->data(); auto filter = const_cast(param->Filter()); @@ -62,11 +62,11 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { fpga::format_filter(filter, max_value, param->Groups()); int element_num_per_div = - fpga::get_element_num_per_div(filter, param->Groups()); + fpga::get_filter_num_per_div(filter, param->Groups()); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_ofm(out); + fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs conv_arg; + fpga::WrapperConvArgs conv_arg = {0}; fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, param->Groups(), param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); @@ -80,7 +80,6 @@ void ConvAddBNKernel::Compute( const FusionConvAddBNParam ¶m) const { fpga::ComputeFpgaConv(param.FpgaArgs()); } -template class ConvAddBNKernel; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp index e38ae9240534b17e97d7ee1c68bffb25a8aedf71..ea01245f1207739d4234ea3509451a2de1d321f4 100644 --- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp @@ -24,7 +24,6 @@ bool ConvAddBNReluKernel::Init( FusionConvAddBNReluParam *param) { bool relu_enabled = true; auto input = const_cast(param->Input()); - auto input_ptr = input->data(); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); auto filter = const_cast(param->Filter()); @@ -39,7 +38,8 @@ bool ConvAddBNReluKernel::Init( "Output channel should be equal to bias number"); const int channel = out->dims()[1]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT auto new_scale = new Tensor(); auto new_bias = new Tensor(); auto new_scale_ptr = new_scale->mutable_data({channel}); @@ -58,16 +58,14 @@ bool ConvAddBNReluKernel::Init( float max_value = fpga::filter_find_max(filter); fpga::format_filter(filter, max_value, param->Groups()); - auto filter_ptr = filter->data(); int element_num_per_div = - fpga::get_element_num_per_div(filter, param->Groups()); + fpga::get_filter_num_per_div(filter, param->Groups()); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_ofm(out); - auto 
out_ptr = out->mutable_data(); + fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs conv_arg; + fpga::WrapperConvArgs conv_arg = {0}; fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, param->Groups(), param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); @@ -80,7 +78,6 @@ void ConvAddBNReluKernel::Compute( const FusionConvAddBNReluParam &param) const { fpga::ComputeFpgaConv(param.FpgaArgs()); } -template class ConvAddBNReluKernel; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp index 31f28df5103942750758040ab983e2c0298a8cfd..928b73e4d30144cdf1128a018628b6208fcfd5f0 100644 --- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp @@ -23,7 +23,6 @@ template <> bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { bool relu_enabled = true; auto input = const_cast(param->Input()); - auto input_ptr = input->data(); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); auto filter = const_cast(param->Filter()); @@ -32,7 +31,8 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], "Output channel should be equal to bias number"); int channel = out->dims()[1]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT for (int i = 0; i < channel; i++) { bs_ptr[i + channel] = 1; bs_ptr[i] = bias_ptr[i]; @@ -40,16 +40,14 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { float max_value = fpga::filter_find_max(filter); fpga::format_filter(filter, max_value, param->Groups()); - auto filter_ptr = filter->data(); int element_num_per_div = - fpga::get_element_num_per_div(filter, param->Groups()); + fpga::get_filter_num_per_div(filter, param->Groups()); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_ofm(out); - auto out_ptr = out->mutable_data(); + fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs conv_arg; + fpga::WrapperConvArgs conv_arg = {0}; fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, param->Groups(), param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); @@ -62,7 +60,6 @@ void ConvAddReluKernel::Compute( const FusionConvAddReluParam &param) const { fpga::ComputeFpgaConv(param.FpgaArgs()); } -template class ConvAddReluKernel; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/conv_bn_kernel.cpp index 8818e98c376ab4e33d399bdf429e5b01928672e2..fea211af74b634fc0dd8dcee1db7c2c004145561 100644 --- a/src/operators/kernel/fpga/conv_bn_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp @@ -15,7 +15,6 @@ limitations under the License.
*/ #ifdef FUSION_CONVBN_OP #include "operators/kernel/conv_bn_kernel.h" -#include "fpga/api.h" namespace paddle_mobile { namespace operators { @@ -24,7 +23,6 @@ template <> bool ConvBNKernel::Init(FusionConvBNParam *param) { bool relu_enabled = false; auto input = const_cast(param->Input()); - auto input_ptr = input->data(); auto filter = const_cast(param->Filter()); auto out = param->Output(); auto bn_mean_ptr = param->InputMean()->data(); @@ -34,10 +32,9 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { const float epsilon = param->Epsilon(); PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], "Output channel should be equal to bias number"); - const int channel = out->dims()[1]; auto bs_ptr = - reinterpret_cast(fpga::fpga_malloc(2 * channel * sizeof(float))); + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT auto new_scale = new Tensor(); auto new_bias = new Tensor(); auto new_scale_ptr = new_scale->mutable_data({channel}); @@ -55,16 +52,14 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { float max_value = fpga::filter_find_max(filter); fpga::format_filter(filter, max_value, param->Groups()); - auto filter_ptr = filter->data(); int element_num_per_div = - fpga::get_element_num_per_div(filter, param->Groups()); + fpga::get_filter_num_per_div(filter, param->Groups()); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_ofm(out); - auto out_ptr = out->mutable_data(); + fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs conv_arg; + fpga::WrapperConvArgs conv_arg = {0}; fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, param->Groups(), param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); @@ -77,7 +72,6 @@ void ConvBNKernel::Compute( const FusionConvBNParam &param) const { fpga::ComputeFpgaConv(param.FpgaArgs()); } -template class ConvBNKernel; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp index 8fe4425a23de2b4b16b241bf65d893d10132cc2e..87fe12664e75717c78d79ec50821a9bb6201c5a0 100644 --- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp @@ -23,7 +23,6 @@ template <> bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { bool relu_enabled = true; auto input = const_cast(param->Input()); - auto input_ptr = input->data(); auto filter = const_cast(param->Filter()); auto out = param->Output(); auto bn_mean_ptr = param->InputMean()->data(); @@ -34,7 +33,8 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], "Output channel should be equal to bias number"); const int channel = out->dims()[1]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT auto new_scale = new Tensor(); auto new_bias = new Tensor(); auto new_scale_ptr = new_scale->mutable_data({channel}); @@ -52,29 +52,14 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { float max_value = fpga::filter_find_max(filter); fpga::format_filter(filter, max_value, param->Groups()); - auto filter_ptr = filter->data(); int element_num_per_div = - fpga::get_element_num_per_div(filter, param->Groups()); + fpga::get_filter_num_per_div(filter, param->Groups()); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); -
fpga::format_ofm(out); - auto out_ptr = out->mutable_data(); + fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs convArgs; - convArgs.group_num = (uint32_t)param->Groups(); - convArgs.split_num = (uint32_t)fpga::get_plit_num(filter); - convArgs.filter_num = (uint32_t)filter->dims()[0]; - convArgs.output.address = out_ptr; - convArgs.output.scale_address = out->scale; - convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc( - convArgs.split_num * sizeof(fpga::ConvArgs)); - param->SetFpgaArgs(convArgs); - - int element_num = fpga::get_aligned_filter_element_num( - filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); - - fpga::WrapperConvArgs conv_arg; + fpga::WrapperConvArgs conv_arg = {0}; fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, param->Groups(), param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); @@ -87,7 +72,6 @@ void ConvBNReluKernel::Compute( const FusionConvBNReluParam &param) const { fpga::ComputeFpgaConv(param.FpgaArgs()); } -template class ConvBNReluKernel; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/dropout_kernel.cpp b/src/operators/kernel/fpga/dropout_kernel.cpp index 3a4dd216d481322a9228cfd247bf6f0d0098177e..b0981c4254060996a16f4ae5beabb7c22edd6d34 100644 --- a/src/operators/kernel/fpga/dropout_kernel.cpp +++ b/src/operators/kernel/fpga/dropout_kernel.cpp @@ -27,13 +27,7 @@ bool DropoutKernel::Init(DropoutParam *param) { template <> void DropoutKernel::Compute( - const DropoutParam &param) const { - // auto *input_x = param.InputX(); - // auto *out = param.Out(); - // auto input_x_ptr = input_x->data(); - // auto out_ptr = out->mutable_data(); - // out_ptr = const_cast(input_x_ptr); -} + const DropoutParam &param) const {} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp index 9840f495e89a3e63990bf5f10c65cf4afe8d0854..f0d8533641941fe43a6d06b49266ac06646a7b4d 100644 --- a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp @@ -27,10 +27,10 @@ bool ElementwiseAddReluKernel::Init( auto *out = param->Out(); auto input_x_ptr = input_x->data(); auto input_y_ptr = input_y->data(); - fpga::format_ofm(out); + fpga::format_fp16_ofm(out); auto out_ptr = out->mutable_data(); - fpga::EWAddArgs ewaddArgs; + fpga::EWAddArgs ewaddArgs = {0}; ewaddArgs.relu_enabled = relu_enabled; ewaddArgs.const0 = 1; ewaddArgs.const1 = 1; diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp index 48d7425fcb7a3c630165fe4a7d26875a4f4a0a9d..052607aae7f3211da211f8aaaff5bb75a36138ce 100644 --- a/src/operators/kernel/fpga/fc_relu_kernel.cpp +++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp @@ -21,7 +21,6 @@ template <> bool FusionFcReluKernel::Init(FusionFcReluParam *param) { bool relu_enabled = true; auto input_x = const_cast(param->InputX()); - auto input_x_ptr = input_x->data(); auto filter = const_cast(param->InputY()); auto input_z = param->InputZ(); auto input_z_ptr = input_z->data(); @@ -29,7 +28,8 @@ bool FusionFcReluKernel::Init(FusionFcReluParam *param) { PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], "Image channel should be equal to weight number"); int channel = (uint32_t)out->dims()[1]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel *
sizeof(float)); // NOLINT for (int i = 0; i < channel; i++) { bs_ptr[i + channel] = 1; bs_ptr[i] = input_z_ptr[i]; @@ -47,14 +47,12 @@ bool FusionFcReluKernel::Init(FusionFcReluParam *param) { filter->Resize(framework::make_ddim({num, filter_channel, height, width})); float max_value = fpga::filter_find_max(filter); fpga::format_filter(filter, max_value, 1); - auto filter_ptr = filter->data(); - int element_num_per_div = fpga::get_element_num_per_div(filter, 1); + int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); + fpga::format_fp16_ofm(out); - auto out_ptr = out->mutable_data(); - - fpga::WrapperConvArgs conv_arg; + fpga::WrapperConvArgs conv_arg = {0}; fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0, 0, bs_ptr); param->SetFpgaArgs(conv_arg); @@ -64,7 +62,7 @@ template <> void FusionFcReluKernel::Compute( const FusionFcReluParam &param) const { fpga::ComputeFpgaConv(param.FpgaArgs()); -}; +} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/fusion_fc_kernel.cpp index ccc6009700c98f1f94835a7e21a83de1faade1f0..6536f796ef2b27d33080c79cf36ac462604782be 100644 --- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp @@ -22,7 +22,6 @@ template <> bool FusionFcKernel::Init(FusionFcParam *param) { bool relu_enabled = false; auto input_x = const_cast(param->InputX()); - auto input_x_ptr = input_x->data(); auto filter = const_cast(param->InputY()); const Tensor *input_z = param->InputZ(); auto input_z_ptr = input_z->data(); @@ -31,7 +30,8 @@ bool FusionFcKernel::Init(FusionFcParam *param) { PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], "Image channel should be equal to weight number"); int channel = (uint32_t)out->dims()[1]; - auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT for (int i = 0; i < channel; i++) { bs_ptr[i + channel] = 1; bs_ptr[i] = input_z_ptr[i]; @@ -48,14 +48,12 @@ bool FusionFcKernel::Init(FusionFcParam *param) { filter->Resize(framework::make_ddim({num, filter_channel, height, width})); float max_value = fpga::filter_find_max(filter); fpga::format_filter(filter, max_value, 1); - auto filter_ptr = filter->data(); - int element_num_per_div = fpga::get_element_num_per_div(filter, 1); + int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); + fpga::format_fp16_ofm(out); - auto out_ptr = out->mutable_data(); - - fpga::WrapperConvArgs conv_arg; + fpga::WrapperConvArgs conv_arg = {0}; fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0, 0, bs_ptr); param->SetFpgaArgs(conv_arg); diff --git a/src/operators/kernel/fpga/pool_kernel.cpp b/src/operators/kernel/fpga/pool_kernel.cpp index d3df951dbc340814d766f76e8720c3aaef2f3539..4dad2f789baeb6e381c66ed861b8a8360fa2996e 100644 --- a/src/operators/kernel/fpga/pool_kernel.cpp +++ b/src/operators/kernel/fpga/pool_kernel.cpp @@ -24,13 +24,13 @@ bool PoolKernel::Init(PoolParam *param) { auto *input = const_cast(param->Input()); auto input_ptr = input->data(); Tensor *output = param->Output(); - fpga::format_ofm(output); + fpga::format_fp16_ofm(output); auto output_ptr = output->mutable_data(); vector ksize = param->Ksize(); vector strides = param->Strides(); vector
paddings = param->Paddings(); - fpga::PoolingArgs poolArgs; + fpga::PoolingArgs poolArgs = {0}; poolArgs.image.address = input_ptr; poolArgs.image.channels = (uint32_t)input->dims()[1]; poolArgs.image.height = (uint32_t)input->dims()[2]; @@ -39,7 +39,7 @@ bool PoolKernel::Init(PoolParam *param) { poolArgs.image.pad_width = (uint32_t)paddings[1]; poolArgs.image.scale_address = input->scale; poolArgs.output.address = output_ptr; - poolArgs.output.scale_address = input->scale; + poolArgs.output.scale_address = output->scale; poolArgs.kernel.height = (uint32_t)ksize[0]; poolArgs.kernel.width = (uint32_t)ksize[1]; poolArgs.kernel.stride_h = (uint32_t)strides[0]; @@ -50,9 +50,7 @@ bool PoolKernel::Init(PoolParam *param) { template <> void PoolKernel::Compute(const PoolParam &param) const { -#ifdef PADDLE_MOBILE_FPGA fpga::ComputeFpgaPool(param.FpgaArgs()); -#endif } } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp index a7f7b0cf57f3c1498a6e9f36cb7196cf9f8b4ceb..dba555708f505eb9bdf81d6f4487227c88f0a616 100644 --- a/src/operators/kernel/fpga/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/softmax_kernel.cpp @@ -43,17 +43,24 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { args.output.scale_address = float_input->scale; param->SetFloatInput(float_input); param->SetFpgaArgs(args); - return true; } template <> void SoftmaxKernel::Compute( const SoftmaxParam &param) const { - // SoftmaxCompute(param); + Tensor *in_x = param.FloatInput(); + Tensor *out = param.Out(); + + fpga::PerformBypass(param.FpgaArgs()); + fpga::fpga_invalidate( + (void *)in_x->data(), // NOLINT + fpga::get_align_image_cw(in_x->dims()[1]) * sizeof(float)); + + math::SoftmaxFuntor()(in_x, out); + fpga::fpga_flush(out->data(), out->memory_size()); } -template class SoftmaxKernel; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp index 402b187f8f5e9d2fbb70fa6bcfb72c88aa53e3d3..91e11fa8ff0184e5321269167b5f4693de2245ac 100644 --- a/src/operators/math/depthwise_conv_3x3.cpp +++ b/src/operators/math/depthwise_conv_3x3.cpp @@ -1465,7 +1465,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, Tensor *output, const Tensor *new_scale, const Tensor *new_bias, bool if_relu) { #if __ARM_NEON - //#ifdef _OPENMP + // #ifdef _OPENMP // const float *newscale_data = new_scale->data(); // const float *newbias_data = new_bias->data(); // @@ -1645,7 +1645,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, // } // } // - //#else + // #else const float *input_data = input->data(); const float *filter_data = filter->data(); @@ -1877,7 +1877,104 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, input_data += inhxw * c; output_data += outhxw * c; } -//#endif +// #endif +#endif +} + +void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter, + Tensor *output, Tensor bias, bool if_bias) { +#if __ARM_NEON + + const int batch_size = static_cast(input->dims()[0]); + const int input_channel = static_cast(input->dims()[1]); + + const int input_height = static_cast(input->dims()[2]); + const int input_width = static_cast(input->dims()[3]); + const int output_height = static_cast(output->dims()[2]); + const int output_width = static_cast(output->dims()[3]); + const int inhxw = input_height * input_width; + const int outhxw = output_height *
output_width; + + float32x4_t zero = vdupq_n_f32(0.0); + for (int b = 0; b < batch_size; b++) { +#pragma omp parallel for + for (int c = 0; c < input_channel; c++) { + const float *filter_data = filter->data() + c * 9; + const float *input_data = input->data() + c * inhxw; + const float *bias_data = bias.data() + c; + float *output_data = output->data() + c * outhxw; + float w00 = filter_data[0]; + float w01 = filter_data[1]; + float w02 = filter_data[2]; + float w10 = filter_data[3]; + float w11 = filter_data[4]; + float w12 = filter_data[5]; + float w20 = filter_data[6]; + float w21 = filter_data[7]; + float w22 = filter_data[8]; + + float32x4_t biasv = vld1q_dup_f32(bias_data); + + for (int i = 0; i < output_height; i += 1) { + for (int m = 0; m < output_width - 2; m += 3) { + float *output_ptr = output_data + i * output_width + m; + float32x4x2_t input_buff_top{}, input_buff_mid{}, input_buff_bottom{}; + float32x4_t in0, in1, in2, in3, in4, in5, tmp0, tmp1, tmp2, tmp3, + tmp4, tmp5, out0; + input_buff_top = + vld2q_f32(input_data + (2 * i) * input_width + (2 * m)); + input_buff_mid = + vld2q_f32(input_data + (2 * i + 1) * input_width + (2 * m)); + input_buff_bottom = + vld2q_f32(input_data + (2 * i + 2) * input_width + (2 * m)); + + in0 = input_buff_top.val[0]; + tmp0 = input_buff_top.val[1]; + tmp1 = vextq_f32(in0, zero, 1); + + in2 = input_buff_mid.val[0]; + tmp2 = input_buff_mid.val[1]; + tmp3 = vextq_f32(in2, zero, 1); + + in4 = input_buff_bottom.val[0]; + tmp4 = input_buff_bottom.val[1]; + tmp5 = vextq_f32(in4, zero, 1); + + out0 = vmulq_n_f32(in0, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in2, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_n_f32(out0, in4, w20); + out0 = vmlaq_n_f32(out0, tmp4, w21); + out0 = vmlaq_n_f32(out0, tmp5, w22); + out0 = vaddq_f32(out0, biasv); + + vst1q_lane_f32(output_ptr, out0, 0); + vst1q_lane_f32(output_ptr + 1, out0, 1); + vst1q_lane_f32(output_ptr + 2, out0, 2); + } + int m; + for (m = 0; m < output_width - 2; m += 3) { + } + for (int j = m; j < output_width; j++) { + output_data[i * output_width + j] = + input_data[(2 * i - 1) * input_width + 2 * j - 1] * w00 + + input_data[(2 * i - 1) * input_width + 2 * j] * w01 + + input_data[(2 * i - 1) * input_width + 2 * j + 1] * w02 + + input_data[(2 * i) * input_width + 2 * j - 1] * w10 + + input_data[(2 * i) * input_width + 2 * j] * w11 + + input_data[(2 * i) * input_width + 2 * j + 1] * w12 + + input_data[(2 * i + 1) * input_width + 2 * j - 1] * w20 + + input_data[(2 * i + 1) * input_width + 2 * j] * w21 + + input_data[(2 * i + 1) * input_width + 2 * j + 1] * w22; + output_data[i * output_width + j] += *bias_data; + } + } + } + } + #endif } diff --git a/src/operators/math/depthwise_conv_3x3.h b/src/operators/math/depthwise_conv_3x3.h index 60e979648f871e640924a3373c625c311c3dd067..b146b88e737a07ea08250315fc94653f63d2ad05 100644 --- a/src/operators/math/depthwise_conv_3x3.h +++ b/src/operators/math/depthwise_conv_3x3.h @@ -43,6 +43,9 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, Tensor *output, const Tensor *new_scale, const Tensor *new_bias, bool if_relu); + +void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter, + Tensor *output, Tensor bias, bool if_bias); } // namespace math } // namespace operators } // namespace paddle_mobile diff --git 
a/src/operators/math/im2col.cpp b/src/operators/math/im2col.cpp index 4065f7d9c4934bce8285ea99fe4f14c4e2cc990c..090ccdf24e214fc86b8a4032df228d50caa65ef9 100644 --- a/src/operators/math/im2col.cpp +++ b/src/operators/math/im2col.cpp @@ -74,7 +74,7 @@ class Im2ColFunctor { const int isize = im_height; bool pad1 = padding[0] > 0; bool pad2 = - (pad1 && + (pad1 && padding[1] && (((isize - 2 * padding[0] + filter_height) % stride[0] == 0) ? 1 : 0)); int fill = isize % 2; if (stride[0] == 1 && filter_height == 3 && pad1 && pad2 && diff --git a/src/operators/math/math_func_neon.h b/src/operators/math/math_func_neon.h index 97e1e6f67d57ec1ad9ea294aa227f5f781e2e273..5bb3fd0f5ae3f6349ab52535348f6310e4096951 100644 --- a/src/operators/math/math_func_neon.h +++ b/src/operators/math/math_func_neon.h @@ -38,7 +38,6 @@ limitations under the License. */ * * (this is the zlib license) */ - #pragma once #include diff --git a/src/operators/math/math_function.cpp b/src/operators/math/math_function.cpp index 6ef9fb2a8252e82014ebebc22f82066eeb324c0d..76edcab9b4d29c7d80abcf64dff3a873321f9e54 100644 --- a/src/operators/math/math_function.cpp +++ b/src/operators/math/math_function.cpp @@ -14,6 +14,7 @@ limitations under the License. */ #include "operators/math/math_function.h" #include +#include #include "operators/math/gemm.h" namespace paddle_mobile { @@ -36,13 +37,35 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a, int N = dim_out[1]; int K = (!trans_a) ? dim_a[1] : dim_a[0]; + if (trans_a) { + int numel = matrix_a.numel(); + int m = matrix_a.dims()[0]; + int n = matrix_a.dims()[1]; + float *tmp = (float *)(matrix_a.data()); // NOLINT + float *a = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * numel)); + int index = 0; + for (int j = 0; j < n; j++) { + for (int i = 0; i < m; i++) { + a[index++] = tmp[i * n + j]; + } + } +#ifdef _OPENMP + Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data(), N, beta, + matrix_out->data(), N, relu, bias); +#else + Sgemm(M, N, K, alpha, a, K, matrix_b.data(), N, beta, + matrix_out->data(), N, relu, bias); +#endif + } else { #ifdef _OPENMP - Sgemm_omp(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), - N, beta, matrix_out->data(), N, relu, bias); + Sgemm_omp(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), + N, beta, matrix_out->data(), N, relu, bias); #else - Sgemm(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, - beta, matrix_out->data(), N, relu, bias); + Sgemm(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, + beta, matrix_out->data(), N, relu, bias); #endif + } } template <> @@ -104,7 +127,7 @@ struct ClearTensor { void operator()(framework::Tensor *tensor) { auto size = tensor->numel(); auto *tensor_data = tensor->data(); - memset((void *)tensor_data, 0, sizeof(T) * size); + memset((void *)tensor_data, 0, sizeof(T) * size); // NOLINT } }; diff --git a/src/operators/math/pool_3x3.cpp b/src/operators/math/pool_3x3.cpp index 05d3017f635a040a52d2cc377c8f384dbbd8086c..dadb5a67cf6dda531b15783feafe5cee370e109a 100644 --- a/src/operators/math/pool_3x3.cpp +++ b/src/operators/math/pool_3x3.cpp @@ -31,186 +31,43 @@ using std::min; using std::vector; void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) { #if __ARM_NEON - const int batch_size = input->dims()[0]; + const int batch_size = static_cast(input->dims()[0]); + const int input_channel = static_cast(input->dims()[1]); - const int h_in = input->dims()[2]; + const int input_height = static_cast(input->dims()[2]); + const int input_width = static_cast(input->dims()[3]); + const int 
output_height = static_cast(output->dims()[2]); + const int output_width = static_cast(output->dims()[3]); - const int w_in = input->dims()[3]; - - const int output_channels = output->dims()[1]; + const int hxw = input_height * input_width; - const int h_out = output->dims()[2]; - const int w_out = output->dims()[3]; - const int outputdata_channel_stride = h_out * w_out; - const int inputdata_channel_stride = h_in * w_in; - const int input_batch_stride = output_channels * inputdata_channel_stride; - const int output_batch_stride = output_channels * outputdata_channel_stride; - float *out_data = output->data(); - const float *input_data = input->data(); + const int l = input_height; const float coef = 1.0 / 9.0; - for (int k = 0; k < batch_size; ++k) { -#pragma omp parallel for - for (int c = 0; c < output_channels; ++c) { - const float *input_seg = input_data + c * inputdata_channel_stride; - float *output_seg = out_data + c * outputdata_channel_stride; - // four corner point - output_seg[0] = (input_seg[0] + input_seg[1] + input_seg[w_in] + - input_seg[w_in + 1]) * - coef; - output_seg[w_out - 1] = - (input_seg[w_in - 2] + input_seg[w_in - 1] + input_seg[w_in * 2 - 2] + - input_seg[2 * w_in - 1]) * - coef; - output_seg[(h_out - 1) * w_out] = - (input_seg[(h_in - 2) * w_in] + input_seg[(h_in - 2) * w_in + 1] + - input_seg[(h_in - 1) * w_in] + input_seg[(h_in - 1) * w_in + 1]) * - coef; - output_seg[h_out * w_out - 1] = - (input_seg[h_in * w_in - 1] + input_seg[h_in * w_in - 2] + - input_seg[(h_in - 1) * w_in - 1] + - input_seg[(h_in - 1) * w_in - 2]) * - coef; - // left side & right side - for (int i = 1; i < h_in - 1; ++i) { - output_seg[i * w_out] = - (input_seg[i * w_in - w_in] + input_seg[i * w_in - w_in + 1] + - input_seg[i * w_in] + input_seg[i * w_in + 1] + - input_seg[i * w_in + w_in] + input_seg[i * w_in + w_in + 1]) * - coef; - output_seg[i * w_out + w_out - 1] = - (input_seg[i * w_in - w_in + w_in - 2] + - input_seg[i * w_in - w_in + 1 + w_in - 2] + - input_seg[i * w_in + w_in - 2] + - input_seg[i * w_in + 1 + w_in - 2] + - input_seg[i * w_in + w_in + w_in - 2] + - input_seg[i * w_in + w_in + 1 + w_in - 2]) * - coef; - } - // top 1 row & bottom 1 row - const float *input_tmp = input_seg; - - float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, - tmp3, tmp4, tmp5, sum, out0; - float32x4_t v_coef = vdupq_n_f32(coef); - in0 = vld1q_f32(input_tmp); - in2 = vld1q_f32(input_tmp + w_in); - const float *input_tmp_end = input_tmp + (h_in - 2) * w_in; - in4 = vld1q_f32(input_tmp_end); - in6 = vld1q_f32(input_tmp_end + w_in); - int c_mid = w_out - 2; - auto output_ptr = output_seg + 1; - for (; c_mid > 3; c_mid -= 4) { - in1 = vld1q_f32(input_tmp + 4); - in3 = vld1q_f32(input_tmp + w_in + 4); - - tmp0 = vextq_f32(in0, in1, 1); - tmp1 = vextq_f32(in0, in1, 2); - - tmp2 = vextq_f32(in2, in3, 1); - tmp3 = vextq_f32(in2, in3, 2); - - sum = vaddq_f32(in0, tmp0); - sum = vaddq_f32(sum, tmp1); - sum = vaddq_f32(sum, in2); - sum = vaddq_f32(sum, tmp2); - sum = vaddq_f32(sum, tmp3); - - vst1q_f32(output_ptr, vmulq_f32(sum, v_coef)); - - in5 = vld1q_f32(input_tmp_end + 4); - in7 = vld1q_f32(input_tmp_end + w_in + 4); - - tmp0 = vextq_f32(in4, in5, 1); - tmp1 = vextq_f32(in4, in5, 2); - tmp2 = vextq_f32(in6, in7, 1); - tmp3 = vextq_f32(in6, in7, 2); - - sum = vaddq_f32(in0, tmp0); - sum = vaddq_f32(sum, tmp1); - sum = vaddq_f32(sum, in2); - sum = vaddq_f32(sum, tmp2); - sum = vaddq_f32(sum, tmp3); - - vst1q_f32(output_ptr + (h_out - 1) * w_out, vmulq_f32(sum, v_coef)); - - // can 
optimize to each 8 stride. - input_tmp += 4; - input_tmp_end += 4; - output_ptr += 4; - in0 = in1; - in2 = in3; - in4 = in5; - in6 = in7; - } - // top right remain - float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]); - float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]); - - tmp0 = vextq_f32(in0, pad0, 1); - tmp1 = vextq_f32(in0, pad0, 2); - tmp2 = vextq_f32(in2, pad1, 2); - tmp3 = vextq_f32(in2, pad1, 2); - - sum = vaddq_f32(in0, tmp0); - sum = vaddq_f32(sum, tmp1); - sum = vaddq_f32(sum, in2); - sum = vaddq_f32(sum, tmp2); - sum = vaddq_f32(sum, tmp3); - out0 = vmulq_f32(sum, v_coef); - - for (int i = 0; i < c_mid; ++i) { - if (i == 0) { - vst1q_lane_f32(output_ptr + i, out0, 0); - } - if (i == 1) { - vst1q_lane_f32(output_ptr + i, out0, 1); - } - if (i == 2) { - vst1q_lane_f32(output_ptr + i, out0, 2); - } - } - - // bottom_right remain - float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]); - float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]); - - tmp0 = vextq_f32(in4, pad2, 1); - tmp1 = vextq_f32(in4, pad2, 2); - tmp2 = vextq_f32(in6, pad3, 2); - tmp3 = vextq_f32(in6, pad3, 2); - - sum = vaddq_f32(in4, tmp0); - sum = vaddq_f32(sum, tmp1); - sum = vaddq_f32(sum, in6); - sum = vaddq_f32(sum, tmp2); - sum = vaddq_f32(sum, tmp3); - out0 = vmulq_f32(sum, v_coef); + const float coef1 = 1.0 / 6.0; + const float coef2 = 1.0 / 4.0; - for (int i = 0; i < c_mid; ++i) { - if (i == 0) { - vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 0); - } - if (i == 1) { - vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 1); - } - if (i == 2) { - vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 2); - } - } - // mid - for (int j = 0; j < h_out - 2; ++j) { - output_ptr = output_seg + w_out * (j + 1) + 1; - input_tmp = input_seg + j * w_in; + float32x4_t v_coef = vdupq_n_f32(coef); + float32x4_t v_coef1 = vdupq_n_f32(coef1); - in0 = vld1q_f32(input_tmp); - in2 = vld1q_f32(input_tmp + w_in); - in4 = vld1q_f32(input_tmp + 2 * w_in); - c_mid = w_out - 2; - for (; c_mid > 3; c_mid -= 4) { - in1 = vld1q_f32(input_tmp + 4); - in3 = vld1q_f32(input_tmp + w_in + 4); - in5 = vld1q_f32(input_tmp + 2 * w_in + 4); + for (int b = 0; b < batch_size; b++) { +#pragma omp parallel for + for (int c = 0; c < input_channel; c++) { + const float *input_data = input->data() + c * hxw; + float *output_data = output->data() + c * hxw; + + for (int i = 1; i < output_height - 1; i++) { + float *output_ptr; + float32x4_t in0, in1, in2, in3, in4, in5, tmp0, tmp1, tmp2, tmp3, tmp4, + tmp5, out0; + for (int m = 1; m < output_width - 4; m += 4) { + output_ptr = output_data + i * output_width + m; + in0 = vld1q_f32(input_data + (i - 1) * input_width + m - 1); + in1 = vld1q_f32(input_data + (i - 1) * input_width + m + 3); + in2 = vld1q_f32(input_data + i * input_width + m - 1); + in3 = vld1q_f32(input_data + i * input_width + m + 3); + in4 = vld1q_f32(input_data + (i + 1) * input_width + m - 1); + in5 = vld1q_f32(input_data + (i + 1) * input_width + m + 3); tmp0 = vextq_f32(in0, in1, 1); tmp1 = vextq_f32(in0, in1, 2); @@ -219,63 +76,383 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) { tmp4 = vextq_f32(in4, in5, 1); tmp5 = vextq_f32(in4, in5, 2); - sum = vaddq_f32(in0, tmp0); - sum = vaddq_f32(sum, tmp1); - sum = vaddq_f32(sum, in2); - sum = vaddq_f32(sum, tmp2); - sum = vaddq_f32(sum, tmp3); - sum = vaddq_f32(sum, in4); - sum = vaddq_f32(sum, tmp4); - sum = vaddq_f32(sum, tmp5); - - out0 = vmulq_f32(sum, v_coef); - vst1q_f32(output_ptr, out0); - output_ptr += 4; - input_tmp 
+= 4; - in0 = in1; - in2 = in3; - in4 = in5; + out0 = in0; + out0 = vaddq_f32(out0, tmp0); + out0 = vaddq_f32(out0, tmp1); + out0 = vaddq_f32(out0, in2); + out0 = vaddq_f32(out0, tmp2); + out0 = vaddq_f32(out0, tmp3); + out0 = vaddq_f32(out0, in4); + out0 = vaddq_f32(out0, tmp4); + out0 = vaddq_f32(out0, tmp5); + + vst1q_f32(output_ptr, vmulq_f32(out0, v_coef)); + } + int m; + for (m = 1; (m + 3) < output_width - 1; m = m + 4) { } - // mid remain - float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]); - float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]); - float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]); - tmp0 = vextq_f32(in0, pad0, 1); - tmp1 = vextq_f32(in0, pad0, 2); - tmp2 = vextq_f32(in2, pad1, 1); - tmp3 = vextq_f32(in2, pad1, 2); - tmp4 = vextq_f32(in4, pad2, 1); - tmp5 = vextq_f32(in4, pad2, 2); + for (int j = m; j < output_width - 1; j++) { + output_data[i * output_width + j] = + input_data[(i - 1) * input_width + j - 1] + + input_data[(i - 1) * input_width + j] + + input_data[(i - 1) * input_width + j + 1] + + input_data[(i)*input_width + j - 1] + + input_data[(i)*input_width + j] + + input_data[(i)*input_width + j + 1] + + input_data[(i + 1) * input_width + j - 1] + + input_data[(i + 1) * input_width + j] + + input_data[(i + 1) * input_width + j + 1]; + output_data[i * output_width + j] = + output_data[i * output_width + j] * coef; + } + } - sum = vaddq_f32(in0, tmp0); - sum = vaddq_f32(sum, tmp1); - sum = vaddq_f32(sum, in2); - sum = vaddq_f32(sum, tmp2); - sum = vaddq_f32(sum, tmp3); - sum = vaddq_f32(sum, in4); - sum = vaddq_f32(sum, tmp4); - sum = vaddq_f32(sum, tmp5); - out0 = vmulq_f32(sum, v_coef); + output_data[0] = + input_data[0] + input_data[1] + input_data[l] + input_data[l + 1]; + output_data[l - 1] = input_data[l - 2] + input_data[l - 1] + + input_data[2 * l - 2] + input_data[2 * l - 1]; + output_data[(l - 1) * l] = + input_data[(l - 2) * l] + input_data[(l - 2) * l + 1] + + input_data[(l - 1) * l] + input_data[(l - 1) * l + 1]; + output_data[l * l - 1] = input_data[(l - 2) * (l + 1)] + + input_data[(l - 2) * (l + 1) + 1] + + input_data[l * l - 2] + input_data[l * l - 1]; + output_data[0] = output_data[0] * coef2; + output_data[l - 1] = output_data[l - 1] * coef2; + output_data[(l - 1) * l] = output_data[(l - 1) * l] * coef2; + output_data[l * l - 1] = output_data[l * l - 1] * coef2; + + for (int i = 1; i < l - 1; ++i) { + output_data[i * l] = input_data[i * l - l] + input_data[i * l - l + 1] + + input_data[i * l] + input_data[i * l + 1] + + input_data[i * l + l] + input_data[i * l + l + 1]; + + output_data[i * l + l - 1] = + input_data[i * l + l - 1 - l - 1] + input_data[i * l + l - 1 - l] + + input_data[i * l + l - 1 - 1] + input_data[i * l + l - 1] + + input_data[i * l + l - 1 + l - 1] + input_data[i * l + l - 1 + l]; + output_data[i * l] = output_data[i * l] * coef1; + output_data[i * l + l - 1] = output_data[i * l + l - 1] * coef1; + } - for (int i = 0; i < c_mid; ++i) { - if (i == 0) { - vst1q_lane_f32(output_ptr + i, out0, 0); - } - if (i == 1) { - vst1q_lane_f32(output_ptr + i, out0, 1); - } - if (i == 2) { - vst1q_lane_f32(output_ptr + i, out0, 2); - } - } + int m; + for (m = 1; m < output_width - 4; m += 4) { + float *output_ptr = output_data + m; + float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0; + in0 = vld1q_f32(input_data + m - 1); + in1 = vld1q_f32(input_data + m + 3); + in2 = vld1q_f32(input_data + input_width + m - 1); + in3 = vld1q_f32(input_data + input_width + m + 3); + tmp0 = vextq_f32(in0, in1, 
1); + tmp1 = vextq_f32(in0, in1, 2); + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + out0 = in0; + out0 = vaddq_f32(out0, tmp0); + out0 = vaddq_f32(out0, tmp1); + out0 = vaddq_f32(out0, in2); + out0 = vaddq_f32(out0, tmp2); + out0 = vaddq_f32(out0, tmp3); + + vst1q_f32(output_ptr, vmulq_f32(out0, v_coef1)); + } + + for (m = 1; (m + 3) < output_width - 1; m += 4) { + } + for (int j = m; j < output_width - 1; j++) { + output_data[j] = input_data[j - 1] + input_data[j] + input_data[j + 1] + + input_data[input_width + j - 1] + + input_data[input_width + j] + + input_data[input_width + j + 1]; + output_data[j] = output_data[j] * coef1; + } + + for (m = 1; m < output_width - 4; m += 4) { + float *output_ptr = + output_data + (output_height - 1) * output_width + m; + + float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0; + in0 = vld1q_f32(input_data + (output_height - 2) * input_width + m - 1); + in1 = vld1q_f32(input_data + (output_height - 2) * input_width + m + 3); + in2 = vld1q_f32(input_data + (output_height - 1) * input_width + m - 1); + in3 = vld1q_f32(input_data + (output_height - 1) * input_width + m + 3); + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + out0 = in0; + out0 = vaddq_f32(out0, tmp0); + out0 = vaddq_f32(out0, tmp1); + out0 = vaddq_f32(out0, in2); + out0 = vaddq_f32(out0, tmp2); + out0 = vaddq_f32(out0, tmp3); + + vst1q_f32(output_ptr, vmulq_f32(out0, v_coef1)); + } + for (m = 1; (m + 3) < output_width - 1; m = m + 4) { + } + for (int j = m; j < output_width - 1; j++) { + output_data[(output_height - 1) * input_width + j] = + input_data[(output_height - 2) * input_width + j - 1] + + input_data[(output_height - 2) * input_width + j] + + input_data[(output_height - 2) * input_width + j + 1] + + input_data[(output_height - 1) * input_width + j - 1] + + input_data[(output_height - 1) * input_width + j] + + input_data[(output_height - 1) * input_width + j + 1]; + output_data[(output_height - 1) * output_width + j] = + output_data[(output_height - 1) * output_width + j] * coef1; } - // input_data += inputdata_channel_stride; - // out_data += outputdata_channel_stride; } - input_data += input_batch_stride; - out_data += output_batch_stride; } + +// const int batch_size = input->dims()[0]; +// +// const int h_in = input->dims()[2]; +// +// const int w_in = input->dims()[3]; +// +// const int output_channels = output->dims()[1]; +// +// const int h_out = output->dims()[2]; +// const int w_out = output->dims()[3]; +// const int outputdata_channel_stride = h_out * w_out; +// const int inputdata_channel_stride = h_in * w_in; +// const int input_batch_stride = output_channels * inputdata_channel_stride; +// const int output_batch_stride = output_channels * +// outputdata_channel_stride; float *out_data = output->data(); const +// float *input_data = input->data(); +// +// const float coef = 1.0 / 9.0; +// for (int k = 0; k < batch_size; ++k) { +// #pragma omp parallel for +// for (int c = 0; c < output_channels; ++c) { +// const float *input_seg = input_data + c * inputdata_channel_stride; +// float *output_seg = out_data + c * outputdata_channel_stride; +// // four corner point +// output_seg[0] = (input_seg[0] + input_seg[1] + input_seg[w_in] + +// input_seg[w_in + 1]) * +// coef; +// output_seg[w_out - 1] = +// (input_seg[w_in - 2] + input_seg[w_in - 1] + input_seg[w_in * 2 - +// 2] + +// input_seg[2 * w_in - 1]) * +// coef; +// output_seg[(h_out - 1) * w_out] = +// 
(input_seg[(h_in - 2) * w_in] + input_seg[(h_in - 2) * w_in + 1] + +// input_seg[(h_in - 1) * w_in] + input_seg[(h_in - 1) * w_in + 1]) +// * +// coef; +// output_seg[h_out * w_out - 1] = +// (input_seg[h_in * w_in - 1] + input_seg[h_in * w_in - 2] + +// input_seg[(h_in - 1) * w_in - 1] + +// input_seg[(h_in - 1) * w_in - 2]) * +// coef; +// // left side & right side +// for (int i = 1; i < h_in - 1; ++i) { +// output_seg[i * w_out] = +// (input_seg[i * w_in - w_in] + input_seg[i * w_in - w_in + 1] + +// input_seg[i * w_in] + input_seg[i * w_in + 1] + +// input_seg[i * w_in + w_in] + input_seg[i * w_in + w_in + 1]) * +// coef; +// output_seg[i * w_out + w_out - 1] = +// (input_seg[i * w_in - w_in + w_in - 2] + +// input_seg[i * w_in - w_in + 1 + w_in - 2] + +// input_seg[i * w_in + w_in - 2] + +// input_seg[i * w_in + 1 + w_in - 2] + +// input_seg[i * w_in + w_in + w_in - 2] + +// input_seg[i * w_in + w_in + 1 + w_in - 2]) * +// coef; +// } +// // top 1 row & bottom 1 row +// const float *input_tmp = input_seg; +// +// float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, +// tmp3, tmp4, tmp5, sum, out0; +// float32x4_t v_coef = vdupq_n_f32(coef); +// in0 = vld1q_f32(input_tmp); +// in2 = vld1q_f32(input_tmp + w_in); +// const float *input_tmp_end = input_tmp + (h_in - 2) * w_in; +// in4 = vld1q_f32(input_tmp_end); +// in6 = vld1q_f32(input_tmp_end + w_in); +// int c_mid = w_out - 2; +// auto output_ptr = output_seg + 1; +// for (; c_mid > 3; c_mid -= 4) { +// in1 = vld1q_f32(input_tmp + 4); +// in3 = vld1q_f32(input_tmp + w_in + 4); +// +// tmp0 = vextq_f32(in0, in1, 1); +// tmp1 = vextq_f32(in0, in1, 2); +// +// tmp2 = vextq_f32(in2, in3, 1); +// tmp3 = vextq_f32(in2, in3, 2); +// +// sum = vaddq_f32(in0, tmp0); +// sum = vaddq_f32(sum, tmp1); +// sum = vaddq_f32(sum, in2); +// sum = vaddq_f32(sum, tmp2); +// sum = vaddq_f32(sum, tmp3); +// +// vst1q_f32(output_ptr, vmulq_f32(sum, v_coef)); +// +// in5 = vld1q_f32(input_tmp_end + 4); +// in7 = vld1q_f32(input_tmp_end + w_in + 4); +// +// tmp0 = vextq_f32(in4, in5, 1); +// tmp1 = vextq_f32(in4, in5, 2); +// tmp2 = vextq_f32(in6, in7, 1); +// tmp3 = vextq_f32(in6, in7, 2); +// +// sum = vaddq_f32(in0, tmp0); +// sum = vaddq_f32(sum, tmp1); +// sum = vaddq_f32(sum, in2); +// sum = vaddq_f32(sum, tmp2); +// sum = vaddq_f32(sum, tmp3); +// +// vst1q_f32(output_ptr + (h_out - 1) * w_out, vmulq_f32(sum, v_coef)); +// +// // can optimize to each 8 stride. 
+// input_tmp += 4; +// input_tmp_end += 4; +// output_ptr += 4; +// in0 = in1; +// in2 = in3; +// in4 = in5; +// in6 = in7; +// } +// // top right remain +// float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]); +// float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]); +// +// tmp0 = vextq_f32(in0, pad0, 1); +// tmp1 = vextq_f32(in0, pad0, 2); +// tmp2 = vextq_f32(in2, pad1, 2); +// tmp3 = vextq_f32(in2, pad1, 2); +// +// sum = vaddq_f32(in0, tmp0); +// sum = vaddq_f32(sum, tmp1); +// sum = vaddq_f32(sum, in2); +// sum = vaddq_f32(sum, tmp2); +// sum = vaddq_f32(sum, tmp3); +// out0 = vmulq_f32(sum, v_coef); +// +// for (int i = 0; i < c_mid; ++i) { +// if (i == 0) { +// vst1q_lane_f32(output_ptr + i, out0, 0); +// } +// if (i == 1) { +// vst1q_lane_f32(output_ptr + i, out0, 1); +// } +// if (i == 2) { +// vst1q_lane_f32(output_ptr + i, out0, 2); +// } +// } +// +// // bottom_right remain +// float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]); +// float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]); +// +// tmp0 = vextq_f32(in4, pad2, 1); +// tmp1 = vextq_f32(in4, pad2, 2); +// tmp2 = vextq_f32(in6, pad3, 2); +// tmp3 = vextq_f32(in6, pad3, 2); +// +// sum = vaddq_f32(in4, tmp0); +// sum = vaddq_f32(sum, tmp1); +// sum = vaddq_f32(sum, in6); +// sum = vaddq_f32(sum, tmp2); +// sum = vaddq_f32(sum, tmp3); +// out0 = vmulq_f32(sum, v_coef); +// +// for (int i = 0; i < c_mid; ++i) { +// if (i == 0) { +// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 0); +// } +// if (i == 1) { +// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 1); +// } +// if (i == 2) { +// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 2); +// } +// } +// // mid +// for (int j = 0; j < h_out - 2; ++j) { +// output_ptr = output_seg + w_out * (j + 1) + 1; +// input_tmp = input_seg + j * w_in; +// +// in0 = vld1q_f32(input_tmp); +// in2 = vld1q_f32(input_tmp + w_in); +// in4 = vld1q_f32(input_tmp + 2 * w_in); +// c_mid = w_out - 2; +// for (; c_mid > 3; c_mid -= 4) { +// in1 = vld1q_f32(input_tmp + 4); +// in3 = vld1q_f32(input_tmp + w_in + 4); +// in5 = vld1q_f32(input_tmp + 2 * w_in + 4); +// +// tmp0 = vextq_f32(in0, in1, 1); +// tmp1 = vextq_f32(in0, in1, 2); +// tmp2 = vextq_f32(in2, in3, 1); +// tmp3 = vextq_f32(in2, in3, 2); +// tmp4 = vextq_f32(in4, in5, 1); +// tmp5 = vextq_f32(in4, in5, 2); +// +// sum = vaddq_f32(in0, tmp0); +// sum = vaddq_f32(sum, tmp1); +// sum = vaddq_f32(sum, in2); +// sum = vaddq_f32(sum, tmp2); +// sum = vaddq_f32(sum, tmp3); +// sum = vaddq_f32(sum, in4); +// sum = vaddq_f32(sum, tmp4); +// sum = vaddq_f32(sum, tmp5); +// +// out0 = vmulq_f32(sum, v_coef); +// vst1q_f32(output_ptr, out0); +// output_ptr += 4; +// input_tmp += 4; +// in0 = in1; +// in2 = in3; +// in4 = in5; +// } +// // mid remain +// float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]); +// float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]); +// float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]); +// +// tmp0 = vextq_f32(in0, pad0, 1); +// tmp1 = vextq_f32(in0, pad0, 2); +// tmp2 = vextq_f32(in2, pad1, 1); +// tmp3 = vextq_f32(in2, pad1, 2); +// tmp4 = vextq_f32(in4, pad2, 1); +// tmp5 = vextq_f32(in4, pad2, 2); +// +// sum = vaddq_f32(in0, tmp0); +// sum = vaddq_f32(sum, tmp1); +// sum = vaddq_f32(sum, in2); +// sum = vaddq_f32(sum, tmp2); +// sum = vaddq_f32(sum, tmp3); +// sum = vaddq_f32(sum, in4); +// sum = vaddq_f32(sum, tmp4); +// sum = vaddq_f32(sum, tmp5); +// out0 = vmulq_f32(sum, v_coef); +// +// for (int i = 0; i < c_mid; 
++i) { +// if (i == 0) { +// vst1q_lane_f32(output_ptr + i, out0, 0); +// } +// if (i == 1) { +// vst1q_lane_f32(output_ptr + i, out0, 1); +// } +// if (i == 2) { +// vst1q_lane_f32(output_ptr + i, out0, 2); +// } +// } +// } +// // input_data += inputdata_channel_stride; +// // out_data += outputdata_channel_stride; +// } +// input_data += input_batch_stride; +// out_data += output_batch_stride; +// } #endif } @@ -662,6 +839,7 @@ void Pool3x3Avg(vector strides, vector paddings, const Tensor *input, wstart = max(wstart, 0); hend = min(hend, input_height); wend = min(wend, input_width); + const float *pos1 = input_seg + hstart * input_width + wstart; const float *pos2 = input_seg + (hstart + 1) * input_width + wstart; const float *pos3 = input_seg + (hstart + 2) * input_width + wstart; @@ -674,7 +852,8 @@ void Pool3x3Avg(vector strides, vector paddings, const Tensor *input, sum += input_seg[h * input_width + w]; } } - output_seg[ph * output_width + pw] = sum / 9.0; + output_seg[ph * output_width + pw] = + sum / ((hend - hstart) * (wend - wstart) * 1.0); } else { #if __aarch64__ #else diff --git a/src/operators/op_param.h b/src/operators/op_param.h index ef31c9b57f510b4f0028e5acd10ebe7786d70fb9..2207c001e9203e39bf687dd01887a9ce0ac654d8 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -317,22 +317,23 @@ class OpParam { } }; -#ifdef CONV_OP template -class ConvParam : OpParam { +class ConvParam : public OpParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: ConvParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, const Scope &scope) { - filter_ = FilterFrom(inputs, scope); - input_ = InputFrom(inputs, scope); - output_ = OutputFrom(outputs, scope); - strides_ = GetAttr>("strides", attrs); - paddings_ = GetAttr>("paddings", attrs); - dilations_ = GetAttr>("dilations", attrs); - groups = GetAttr("groups", attrs); + filter_ = OpParam::FilterFrom(inputs, scope); + input_ = OpParam::InputFrom(inputs, scope); + if (outputs.count("Output")) { + output_ = OpParam::OutputFrom(outputs, scope); + } + strides_ = OpParam::GetAttr>("strides", attrs); + paddings_ = OpParam::GetAttr>("paddings", attrs); + dilations_ = OpParam::GetAttr>("dilations", attrs); + groups = OpParam::GetAttr("groups", attrs); } const RType *Input() const { return input_; } @@ -360,7 +361,6 @@ class ConvParam : OpParam { }; template Print &operator<<(Print &printer, const ConvParam &conv_param); -#endif template class ElementwiseAddParam : OpParam { @@ -652,6 +652,11 @@ class PriorBoxParam : public OpParam { max_sizes_ = GetAttr>("max_sizes", attrs); aspect_ratios_ = GetAttr>("aspect_ratios", attrs); variances_ = GetAttr>("variances", attrs); + + if (HasAttr("min_max_aspect_ratios_order", attrs)) { + min_max_aspect_ratios_order_ = + GetAttr("min_max_aspect_ratios_order", attrs); + } flip_ = GetAttr("flip", attrs); clip_ = GetAttr("clip", attrs); step_w_ = GetAttr("step_w", attrs); @@ -684,6 +689,10 @@ class PriorBoxParam : public OpParam { const float &Offset() const { return offset_; } + const bool &MinMaxAspectRatiosOrder() const { + return min_max_aspect_ratios_order_; + } + private: RType *input_; RType *input_image_; @@ -698,6 +707,7 @@ class PriorBoxParam : public OpParam { float step_w_; float step_h_; float offset_; + bool min_max_aspect_ratios_order_; }; #endif @@ -761,7 +771,7 @@ class SoftmaxParam : public OpParam { fpga::BypassArgs fpga_bypass_args; public: - RType *FloatInput() { + RType 
*FloatInput() const { return float_input_x_ == nullptr ? input_x_ : float_input_x_.get(); } void SetFloatInput(Tensor *input) { float_input_x_.reset(input); } @@ -1260,52 +1270,29 @@ using FusionFcReluParam = FusionFcParam; #endif template -class FusionConvAddParam : public OpParam { +class FusionConvAddParam : public ConvParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: FusionConvAddParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - bias_ = InputYFrom(inputs, scope); - axis_ = GetAttr("axis", attrs); - filter_ = FilterFrom(inputs, scope); - input_ = InputFrom(inputs, scope); - output_ = OutFrom(outputs, scope); - strides_ = GetAttr>("strides", attrs); - paddings_ = GetAttr>("paddings", attrs); - dilations_ = GetAttr>("dilations", attrs); - groups = GetAttr("groups", attrs); + const Scope &scope) + : ConvParam(inputs, outputs, attrs, scope) { + bias_ = OpParam::InputYFrom(inputs, scope); + axis_ = OpParam::GetAttr("axis", attrs); + output_ = OpParam::OutFrom(outputs, scope); } RType *Bias() const { return bias_; } const int &Axis() const { return axis_; } - const RType *Input() const { return input_; } - - const RType *Filter() const { return filter_; } - RType *Output() const { return output_; } - const vector &Strides() const { return strides_; } - - const vector &Paddings() const { return paddings_; } - - const vector &Dilations() const { return dilations_; } - - const int &Groups() const { return groups; } - protected: RType *bias_; int axis_; - RType *input_; RType *output_; - RType *filter_; - vector strides_; - vector paddings_; - vector dilations_; - int groups; #ifdef PADDLE_MOBILE_FPGA private: @@ -1332,58 +1319,33 @@ class FusionConvAddReluParam : public FusionConvAddParam { #endif #ifdef FUSION_CONVADDPRELU_OP -template -class FusionConvAddPReluParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; +template +class FusionConvAddPReluParam : public ConvParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; public: FusionConvAddPReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - alpha_ = InputAlphaFrom(inputs, scope); - mode_ = GetAttr("mode", attrs); + const AttributeMap &attrs, const Scope &scope) + : ConvParam(inputs, outputs, attrs, scope) { + alpha_ = OpParam::InputAlphaFrom(inputs, scope); + mode_ = OpParam::GetAttr("mode", attrs); framework::DDim dims = alpha_->dims(); - bias_ = InputYFrom(inputs, scope); - axis_ = GetAttr("axis", attrs); - filter_ = FilterFrom(inputs, scope); - input_ = InputFrom(inputs, scope); - output_ = OutFrom(outputs, scope); - strides_ = GetAttr>("strides", attrs); - paddings_ = GetAttr>("paddings", attrs); - dilations_ = GetAttr>("dilations", attrs); - groups = GetAttr("groups", attrs); + bias_ = OpParam::InputYFrom(inputs, scope); + axis_ = OpParam::GetAttr("axis", attrs); + output_ = OpParam::OutFrom(outputs, scope); } const RType *InputAlpha() const { return alpha_; } const std::string &Mode() const { return mode_; } RType *Bias() const { return bias_; } - const int &Axis() const { return axis_; } - - const RType *Input() const { return input_; } - - const RType *Filter() const { return filter_; } - RType *Output() const { return output_; } - const vector &Strides() const { return strides_; } - - const vector 
&Paddings() const { return paddings_; } - - const vector &Dilations() const { return dilations_; } - - const int &Groups() const { return groups; } - protected: RType *bias_; int axis_; - RType *input_; RType *output_; - RType *filter_; - vector strides_; - vector paddings_; - vector dilations_; - int groups; RType *alpha_; std::string mode_; #ifdef PADDLE_MOBILE_FPGA @@ -1399,35 +1361,30 @@ class FusionConvAddPReluParam : public OpParam { #endif #ifdef FUSION_CONVADDADDPRELU_OP -template -class FusionConvAddAddPReluParam : public OpParam { - typedef typename DtypeTensorTrait::gtype GType; - typedef typename DtypeTensorTrait::rtype RType; +template +class FusionConvAddAddPReluParam : public ConvParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; public: FusionConvAddAddPReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - bias1_ = InputYFrom1(inputs, scope); - alpha_ = InputAlphaFrom(inputs, scope); - mode_ = GetAttr("mode", attrs); + const AttributeMap &attrs, const Scope &scope) + : ConvParam(inputs, outputs, attrs, scope) { + bias1_ = OpParam::InputYFrom1(inputs, scope); + alpha_ = OpParam::InputAlphaFrom(inputs, scope); + mode_ = OpParam::GetAttr("mode", attrs); framework::DDim dims = alpha_->dims(); - bias_ = InputYFrom(inputs, scope); - axis_ = GetAttr("axis", attrs); - filter_ = FilterFrom(inputs, scope); - input_ = InputFrom(inputs, scope); - output_ = OutFrom(outputs, scope); - strides_ = GetAttr>("strides", attrs); - paddings_ = GetAttr>("paddings", attrs); - dilations_ = GetAttr>("dilations", attrs); - groups = GetAttr("groups", attrs); - keyOutput_ = getkey("addOut", inputs, 0); - keyX1_ = getkey("addX", inputs, 1); - keyY1_ = getkey("Y", inputs, 1); + bias_ = OpParam::InputYFrom(inputs, scope); + output_ = OpParam::OutFrom(outputs, scope); + axis_ = OpParam::GetAttr("axis", attrs); + keyOutput_ = OpParam::getkey("addOut", inputs, 0); + keyX1_ = OpParam::getkey("addX", inputs, 1); + keyY1_ = OpParam::getkey("Y", inputs, 1); if (keyX1_ == keyOutput_) { - bias1_ = InputYFrom1(inputs, scope); + bias1_ = OpParam::InputYFrom1(inputs, scope); } else if (keyY1_ == keyOutput_) { - bias1_ = InputXFrom1(inputs, scope); + bias1_ = OpParam::InputXFrom1(inputs, scope); } } const RType *InputAlpha() const { return alpha_; } @@ -1437,31 +1394,12 @@ class FusionConvAddAddPReluParam : public OpParam { RType *Bias() const { return bias_; } const int &Axis() const { return axis_; } - - const RType *Input() const { return input_; } - - const RType *Filter() const { return filter_; } - RType *Output() const { return output_; } - const vector &Strides() const { return strides_; } - - const vector &Paddings() const { return paddings_; } - - const vector &Dilations() const { return dilations_; } - - const int &Groups() const { return groups; } - protected: RType *bias_; int axis_; - RType *input_; RType *output_; - RType *filter_; - vector strides_; - vector paddings_; - vector dilations_; - int groups; RType *alpha_; std::string mode_; RType *bias1_; @@ -1482,49 +1420,32 @@ class FusionConvAddAddPReluParam : public OpParam { #ifdef FUSION_CONVADDBNRELU_OP template -class FusionConvAddBNReluParam : public OpParam { +class FusionConvAddBNReluParam : public ConvParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: FusionConvAddBNReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const 
AttributeMap &attrs, const Scope &scope) { - bias_ = InputYFrom(inputs, scope); - axis_ = GetAttr("axis", attrs); - filter_ = FilterFrom(inputs, scope); - input_ = InputFrom(inputs, scope); - output_ = OutFrom(outputs, scope); - strides_ = GetAttr>("strides", attrs); - paddings_ = GetAttr>("paddings", attrs); - dilations_ = GetAttr>("dilations", attrs); - groups = GetAttr("groups", attrs); - input_bias_ = InputBiasFrom(inputs, scope); - input_mean_ = InputMeanFrom(inputs, scope); - input_scale_ = InputScaleFrom(inputs, scope); - input_variance_ = InputVarianceFrom(inputs, scope); - epsilon_ = GetAttr("epsilon", attrs); - momentum_ = GetAttr("momentum", attrs); - // is_test_ = GetAttr("is_test", attrs); + const AttributeMap &attrs, const Scope &scope) + : ConvParam(inputs, outputs, attrs, scope) { + bias_ = OpParam::InputYFrom(inputs, scope); + axis_ = OpParam::GetAttr("axis", attrs); + output_ = OpParam::OutFrom(outputs, scope); + input_bias_ = OpParam::InputBiasFrom(inputs, scope); + input_mean_ = OpParam::InputMeanFrom(inputs, scope); + input_scale_ = OpParam::InputScaleFrom(inputs, scope); + input_variance_ = OpParam::InputVarianceFrom(inputs, scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + // is_test_ = OpParam::GetAttr("is_test", attrs); } RType *Bias() const { return bias_; } const int &Axis() const { return axis_; } - const RType *Input() const { return input_; } - - const RType *Filter() const { return filter_; } - RType *Output() const { return output_; } - const vector &Strides() const { return strides_; } - - const vector &Paddings() const { return paddings_; } - - const vector &Dilations() const { return dilations_; } - - const int &Groups() const { return groups; } - const RType *InputBias() const { return input_bias_; } const RType *InputMean() const { return input_mean_; } @@ -1550,13 +1471,7 @@ class FusionConvAddBNReluParam : public OpParam { protected: RType *bias_; int axis_; - RType *input_; RType *output_; - RType *filter_; - vector strides_; - vector paddings_; - vector dilations_; - int groups; RType *input_bias_; RType *input_mean_; RType *input_scale_; @@ -1580,57 +1495,40 @@ class FusionConvAddBNReluParam : public OpParam { #ifdef FUSION_CONVBNADDRELU_OP template -class FusionConvBNAddReluParam : public OpParam { +class FusionConvBNAddReluParam : public ConvParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: FusionConvBNAddReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - bias_ = InputYFrom(inputs, scope); - axis_ = GetAttr("axis", attrs); - filter_ = FilterFrom(inputs, scope); - input_ = InputFrom(inputs, scope); - output_ = OutFrom(outputs, scope); - strides_ = GetAttr>("strides", attrs); - paddings_ = GetAttr>("paddings", attrs); - dilations_ = GetAttr>("dilations", attrs); - groups = GetAttr("groups", attrs); - input_bias_ = InputBiasFrom(inputs, scope); - input_mean_ = InputMeanFrom(inputs, scope); - input_scale_ = InputScaleFrom(inputs, scope); - input_variance_ = InputVarianceFrom(inputs, scope); - epsilon_ = GetAttr("epsilon", attrs); - momentum_ = GetAttr("momentum", attrs); - keyBNY_ = getkey("BNY", inputs, 0); - keyX_ = getkey("X", inputs, 0); - keyY_ = getkey("Y", inputs, 0); + const AttributeMap &attrs, const Scope &scope) + : ConvParam(inputs, outputs, attrs, scope) { + bias_ = OpParam::InputYFrom(inputs, scope); + axis_ = OpParam::GetAttr("axis", attrs); 
+ output_ = OpParam::OutFrom(outputs, scope); + input_bias_ = OpParam::InputBiasFrom(inputs, scope); + input_mean_ = OpParam::InputMeanFrom(inputs, scope); + input_scale_ = OpParam::InputScaleFrom(inputs, scope); + input_variance_ = OpParam::InputVarianceFrom(inputs, scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + keyBNY_ = OpParam::getkey("BNY", inputs, 0); + keyX_ = OpParam::getkey("X", inputs, 0); + keyY_ = OpParam::getkey("Y", inputs, 0); if (keyX_ == keyBNY_) { - bias_ = InputYFrom(inputs, scope); + bias_ = OpParam::InputYFrom(inputs, scope); } else if (keyY_ == keyBNY_) { - bias_ = InputXFrom(inputs, scope); + bias_ = OpParam::InputXFrom(inputs, scope); } - // is_test_ = GetAttr("is_test", attrs); + // is_test_ = OpParam::GetAttr("is_test", attrs); } RType *Bias() const { return bias_; } const int &Axis() const { return axis_; } - const RType *Input() const { return input_; } - - const RType *Filter() const { return filter_; } - RType *Output() const { return output_; } - const vector &Strides() const { return strides_; } - - const vector &Paddings() const { return paddings_; } - - const vector &Dilations() const { return dilations_; } - - const int &Groups() const { return groups; } - const RType *InputBias() const { return input_bias_; } const RType *InputMean() const { return input_mean_; } @@ -1656,13 +1554,7 @@ class FusionConvBNAddReluParam : public OpParam { protected: RType *bias_; int axis_; - RType *input_; RType *output_; - RType *filter_; - vector strides_; - vector paddings_; - vector dilations_; - int groups; RType *input_bias_; RType *input_mean_; RType *input_scale_; @@ -1689,44 +1581,26 @@ class FusionConvBNAddReluParam : public OpParam { #ifdef FUSION_CONVBN_OP template -class FusionConvBNParam : public OpParam { +class FusionConvBNParam : public ConvParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: FusionConvBNParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - filter_ = FilterFrom(inputs, scope); - input_ = InputFrom(inputs, scope); - output_y_ = OutputYFrom(outputs, scope); - strides_ = GetAttr>("strides", attrs); - paddings_ = GetAttr>("paddings", attrs); - dilations_ = GetAttr>("dilations", attrs); - groups = GetAttr("groups", attrs); - input_bias_ = InputBiasFrom(inputs, scope); - input_mean_ = InputMeanFrom(inputs, scope); - input_scale_ = InputScaleFrom(inputs, scope); - input_variance_ = InputVarianceFrom(inputs, scope); - epsilon_ = GetAttr("epsilon", attrs); - momentum_ = GetAttr("momentum", attrs); - // is_test_ = GetAttr("is_test", attrs); + const Scope &scope) + : ConvParam(inputs, outputs, attrs, scope) { + output_y_ = OpParam::OutputYFrom(outputs, scope); + input_bias_ = OpParam::InputBiasFrom(inputs, scope); + input_mean_ = OpParam::InputMeanFrom(inputs, scope); + input_scale_ = OpParam::InputScaleFrom(inputs, scope); + input_variance_ = OpParam::InputVarianceFrom(inputs, scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + // is_test_ = OpParam::GetAttr("is_test", attrs); } - - const RType *Input() const { return input_; } - - const RType *Filter() const { return filter_; } - RType *Output() const { return output_y_; } - const vector &Strides() const { return strides_; } - - const vector &Paddings() const { return paddings_; } - - const vector &Dilations() const { return dilations_; } - - const int 
&Groups() const { return groups; } - const RType *InputBias() const { return input_bias_; } const RType *InputMean() const { return input_mean_; } @@ -1750,13 +1624,7 @@ class FusionConvBNParam : public OpParam { const RType *NewBias() const { return new_bias_; } protected: - RType *input_; RType *output_y_; - RType *filter_; - vector strides_; - vector paddings_; - vector dilations_; - int groups; RType *input_bias_; RType *input_mean_; RType *input_scale_; @@ -1780,49 +1648,32 @@ class FusionConvBNParam : public OpParam { #ifdef FUSION_CONVADDBN_OP template -class FusionConvAddBNParam : public OpParam { +class FusionConvAddBNParam : public ConvParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: FusionConvAddBNParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - bias_ = InputYFrom(inputs, scope); - axis_ = GetAttr("axis", attrs); - filter_ = FilterFrom(inputs, scope); - input_ = InputFrom(inputs, scope); - output_y_ = OutputYFrom(outputs, scope); - strides_ = GetAttr>("strides", attrs); - paddings_ = GetAttr>("paddings", attrs); - dilations_ = GetAttr>("dilations", attrs); - groups = GetAttr("groups", attrs); - input_bias_ = InputBiasFrom(inputs, scope); - input_mean_ = InputMeanFrom(inputs, scope); - input_scale_ = InputScaleFrom(inputs, scope); - input_variance_ = InputVarianceFrom(inputs, scope); - epsilon_ = GetAttr("epsilon", attrs); - momentum_ = GetAttr("momentum", attrs); - // is_test_ = GetAttr("is_test", attrs); + const AttributeMap &attrs, const Scope &scope) + : ConvParam(inputs, outputs, attrs, scope) { + bias_ = OpParam::InputYFrom(inputs, scope); + axis_ = OpParam::GetAttr("axis", attrs); + output_y_ = OpParam::OutputYFrom(outputs, scope); + input_bias_ = OpParam::InputBiasFrom(inputs, scope); + input_mean_ = OpParam::InputMeanFrom(inputs, scope); + input_scale_ = OpParam::InputScaleFrom(inputs, scope); + input_variance_ = OpParam::InputVarianceFrom(inputs, scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + // is_test_ = OpParam::GetAttr("is_test", attrs); } RType *Bias() const { return bias_; } const int &Axis() const { return axis_; } - const RType *Input() const { return input_; } - - const RType *Filter() const { return filter_; } - RType *Output() const { return output_y_; } - const vector &Strides() const { return strides_; } - - const vector &Paddings() const { return paddings_; } - - const vector &Dilations() const { return dilations_; } - - const int &Groups() const { return groups; } - const RType *InputBias() const { return input_bias_; } const RType *InputMean() const { return input_mean_; } @@ -1848,13 +1699,7 @@ class FusionConvAddBNParam : public OpParam { protected: RType *bias_; int axis_; - RType *input_; RType *output_y_; - RType *filter_; - vector strides_; - vector paddings_; - vector dilations_; - int groups; RType *input_bias_; RType *input_mean_; RType *input_scale_; @@ -1878,44 +1723,26 @@ class FusionConvAddBNParam : public OpParam { #ifdef FUSION_DWCONVBNRELU_OP template -class FusionDWConvBNReluParam : public OpParam { +class FusionDWConvBNReluParam : public ConvParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: FusionDWConvBNReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - filter_ = FilterFrom(inputs, scope); - input_ = 
InputFrom(inputs, scope); - output_ = OutFrom(outputs, scope); - strides_ = GetAttr>("strides", attrs); - paddings_ = GetAttr>("paddings", attrs); - dilations_ = GetAttr>("dilations", attrs); - groups = GetAttr("groups", attrs); - input_bias_ = InputBiasFrom(inputs, scope); - input_mean_ = InputMeanFrom(inputs, scope); - input_scale_ = InputScaleFrom(inputs, scope); - input_variance_ = InputVarianceFrom(inputs, scope); - epsilon_ = GetAttr("epsilon", attrs); - momentum_ = GetAttr("momentum", attrs); - // is_test_ = GetAttr("is_test", attrs); + const AttributeMap &attrs, const Scope &scope) + : ConvParam(inputs, outputs, attrs, scope) { + output_ = OpParam::OutFrom(outputs, scope); + input_bias_ = OpParam::InputBiasFrom(inputs, scope); + input_mean_ = OpParam::InputMeanFrom(inputs, scope); + input_scale_ = OpParam::InputScaleFrom(inputs, scope); + input_variance_ = OpParam::InputVarianceFrom(inputs, scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + // is_test_ = OpParam::GetAttr("is_test", attrs); } - - const RType *Input() const { return input_; } - - const RType *Filter() const { return filter_; } - RType *Output() const { return output_; } - const vector &Strides() const { return strides_; } - - const vector &Paddings() const { return paddings_; } - - const vector &Dilations() const { return dilations_; } - - const int &Groups() const { return groups; } - const RType *InputBias() const { return input_bias_; } const RType *InputMean() const { return input_mean_; } @@ -1939,13 +1766,7 @@ class FusionDWConvBNReluParam : public OpParam { const RType *NewBias() const { return new_bias_; } protected: - RType *input_; RType *output_; - RType *filter_; - vector strides_; - vector paddings_; - vector dilations_; - int groups; RType *input_bias_; RType *input_mean_; RType *input_scale_; @@ -1961,45 +1782,26 @@ class FusionDWConvBNReluParam : public OpParam { #ifdef FUSION_CONVBNRELU_OP template -class FusionConvBNReluParam : public OpParam { +class FusionConvBNReluParam : public ConvParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: FusionConvBNReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - filter_ = FilterFrom(inputs, scope); - input_ = InputFrom(inputs, scope); - output_ = OutFrom(outputs, scope); - - strides_ = GetAttr>("strides", attrs); - paddings_ = GetAttr>("paddings", attrs); - dilations_ = GetAttr>("dilations", attrs); - groups = GetAttr("groups", attrs); - input_bias_ = InputBiasFrom(inputs, scope); - input_mean_ = InputMeanFrom(inputs, scope); - input_scale_ = InputScaleFrom(inputs, scope); - input_variance_ = InputVarianceFrom(inputs, scope); - epsilon_ = GetAttr("epsilon", attrs); - momentum_ = GetAttr("momentum", attrs); - // is_test_ = GetAttr("is_test", attrs); + const AttributeMap &attrs, const Scope &scope) + : ConvParam(inputs, outputs, attrs, scope) { + output_ = OpParam::OutFrom(outputs, scope); + input_bias_ = OpParam::InputBiasFrom(inputs, scope); + input_mean_ = OpParam::InputMeanFrom(inputs, scope); + input_scale_ = OpParam::InputScaleFrom(inputs, scope); + input_variance_ = OpParam::InputVarianceFrom(inputs, scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + // is_test_ = OpParam::GetAttr("is_test", attrs); } - - const RType *Input() const { return input_; } - - const RType *Filter() const { return filter_; } - RType 
*Output() const { return output_; } - const vector &Strides() const { return strides_; } - - const vector &Paddings() const { return paddings_; } - - const vector &Dilations() const { return dilations_; } - - const int &Groups() const { return groups; } - const RType *InputBias() const { return input_bias_; } const RType *InputMean() const { return input_mean_; } @@ -2023,13 +1825,7 @@ class FusionConvBNReluParam : public OpParam { const RType *NewBias() const { return new_bias_; } protected: - RType *input_; RType *output_; - RType *filter_; - vector strides_; - vector paddings_; - vector dilations_; - int groups; RType *input_bias_; RType *input_mean_; RType *input_scale_; @@ -2331,16 +2127,14 @@ class ShapeParam : public OpParam { }; #endif -template +template class QuantizeParam : public OpParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: - QuantizeParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, - const Scope &scope) { + QuantizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { input_ = InputXFrom(inputs, scope); out_ = OutFrom(outputs, scope); if (HasAttr("is_static", attrs)) { @@ -2375,16 +2169,14 @@ class QuantizeParam : public OpParam { RoundType round_type_ = ROUND_NEAREST_TO_EVEN; }; -template +template class DequantizeParam : public OpParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: - DequantizeParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, - const Scope &scope) { + DequantizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { input_ = InputXFrom(inputs, scope); out_ = OutFrom(outputs, scope); activation_scale_ = GetVarValue("Scale", inputs, scope); diff --git a/src/protobuf-c/protobuf-c.h b/src/protobuf-c/protobuf-c.h old mode 100755 new mode 100644 index 390bf4238eb7204432bc8598af1e7cd0ce8bcf81..5855acee71c01a1ea2156f9c0721ebd9c71dfe3b --- a/src/protobuf-c/protobuf-c.h +++ b/src/protobuf-c/protobuf-c.h @@ -202,40 +202,40 @@ size_t foo__bar__baz_bah__pack_to_buffer #include #ifdef __cplusplus -# define PROTOBUF_C__BEGIN_DECLS extern "C" { -# define PROTOBUF_C__END_DECLS } +#define PROTOBUF_C__BEGIN_DECLS extern "C" { +#define PROTOBUF_C__END_DECLS } #else -# define PROTOBUF_C__BEGIN_DECLS -# define PROTOBUF_C__END_DECLS +#define PROTOBUF_C__BEGIN_DECLS +#define PROTOBUF_C__END_DECLS #endif PROTOBUF_C__BEGIN_DECLS #if defined(_WIN32) && defined(PROTOBUF_C_USE_SHARED_LIB) -# ifdef PROTOBUF_C_EXPORT -# define PROTOBUF_C__API __declspec(dllexport) -# else -# define PROTOBUF_C__API __declspec(dllimport) -# endif +#ifdef PROTOBUF_C_EXPORT +#define PROTOBUF_C__API __declspec(dllexport) #else -# define PROTOBUF_C__API +#define PROTOBUF_C__API __declspec(dllimport) +#endif +#else +#define PROTOBUF_C__API #endif #if !defined(PROTOBUF_C__NO_DEPRECATED) && \ - ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) -# define PROTOBUF_C__DEPRECATED __attribute__((__deprecated__)) + ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) +#define PROTOBUF_C__DEPRECATED __attribute__((__deprecated__)) #else -# define PROTOBUF_C__DEPRECATED +#define PROTOBUF_C__DEPRECATED #endif #ifndef PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE - #define PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(enum_name) \ +#define 
PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(enum_name) \ , _##enum_name##_IS_INT_SIZE = INT_MAX #endif -#define PROTOBUF_C__SERVICE_DESCRIPTOR_MAGIC 0x14159bc3 -#define PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC 0x28aaeef9 -#define PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC 0x114315af +#define PROTOBUF_C__SERVICE_DESCRIPTOR_MAGIC 0x14159bc3 +#define PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC 0x28aaeef9 +#define PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC 0x114315af /* Empty string used for initializers */ extern const char protobuf_c_empty_string[]; @@ -253,14 +253,14 @@ extern const char protobuf_c_empty_string[]; * Values for the `flags` word in `ProtobufCFieldDescriptor`. */ typedef enum { - /** Set if the field is repeated and marked with the `packed` option. */ - PROTOBUF_C_FIELD_FLAG_PACKED = (1 << 0), + /** Set if the field is repeated and marked with the `packed` option. */ + PROTOBUF_C_FIELD_FLAG_PACKED = (1 << 0), - /** Set if the field is marked with the `deprecated` option. */ - PROTOBUF_C_FIELD_FLAG_DEPRECATED = (1 << 1), + /** Set if the field is marked with the `deprecated` option. */ + PROTOBUF_C_FIELD_FLAG_DEPRECATED = (1 << 1), - /** Set if the field is a member of a oneof (union). */ - PROTOBUF_C_FIELD_FLAG_ONEOF = (1 << 2), + /** Set if the field is a member of a oneof (union). */ + PROTOBUF_C_FIELD_FLAG_ONEOF = (1 << 2), } ProtobufCFieldFlag; /** @@ -272,27 +272,27 @@ typedef enum { * https://developers.google.com/protocol-buffers/docs/proto#simple */ typedef enum { - /** A well-formed message must have exactly one of this field. */ - PROTOBUF_C_LABEL_REQUIRED, - - /** - * A well-formed message can have zero or one of this field (but not - * more than one). - */ - PROTOBUF_C_LABEL_OPTIONAL, - - /** - * This field can be repeated any number of times (including zero) in a - * well-formed message. The order of the repeated values will be - * preserved. - */ - PROTOBUF_C_LABEL_REPEATED, - - /** - * This field has no label. This is valid only in proto3 and is - * equivalent to OPTIONAL but no "has" quantifier will be consulted. - */ - PROTOBUF_C_LABEL_NONE, + /** A well-formed message must have exactly one of this field. */ + PROTOBUF_C_LABEL_REQUIRED, + + /** + * A well-formed message can have zero or one of this field (but not + * more than one). + */ + PROTOBUF_C_LABEL_OPTIONAL, + + /** + * This field can be repeated any number of times (including zero) in a + * well-formed message. The order of the repeated values will be + * preserved. + */ + PROTOBUF_C_LABEL_REPEATED, + + /** + * This field has no label. This is valid only in proto3 and is + * equivalent to OPTIONAL but no "has" quantifier will be consulted. 
+ */ + PROTOBUF_C_LABEL_NONE, } ProtobufCLabel; /** @@ -304,23 +304,23 @@ typedef enum { * https://developers.google.com/protocol-buffers/docs/proto#scalar */ typedef enum { - PROTOBUF_C_TYPE_INT32, /**< int32 */ - PROTOBUF_C_TYPE_SINT32, /**< signed int32 */ - PROTOBUF_C_TYPE_SFIXED32, /**< signed int32 (4 bytes) */ - PROTOBUF_C_TYPE_INT64, /**< int64 */ - PROTOBUF_C_TYPE_SINT64, /**< signed int64 */ - PROTOBUF_C_TYPE_SFIXED64, /**< signed int64 (8 bytes) */ - PROTOBUF_C_TYPE_UINT32, /**< unsigned int32 */ - PROTOBUF_C_TYPE_FIXED32, /**< unsigned int32 (4 bytes) */ - PROTOBUF_C_TYPE_UINT64, /**< unsigned int64 */ - PROTOBUF_C_TYPE_FIXED64, /**< unsigned int64 (8 bytes) */ - PROTOBUF_C_TYPE_FLOAT, /**< float */ - PROTOBUF_C_TYPE_DOUBLE, /**< double */ - PROTOBUF_C_TYPE_BOOL, /**< boolean */ - PROTOBUF_C_TYPE_ENUM, /**< enumerated type */ - PROTOBUF_C_TYPE_STRING, /**< UTF-8 or ASCII string */ - PROTOBUF_C_TYPE_BYTES, /**< arbitrary byte sequence */ - PROTOBUF_C_TYPE_MESSAGE, /**< nested message */ + PROTOBUF_C_TYPE_INT32, /**< int32 */ + PROTOBUF_C_TYPE_SINT32, /**< signed int32 */ + PROTOBUF_C_TYPE_SFIXED32, /**< signed int32 (4 bytes) */ + PROTOBUF_C_TYPE_INT64, /**< int64 */ + PROTOBUF_C_TYPE_SINT64, /**< signed int64 */ + PROTOBUF_C_TYPE_SFIXED64, /**< signed int64 (8 bytes) */ + PROTOBUF_C_TYPE_UINT32, /**< unsigned int32 */ + PROTOBUF_C_TYPE_FIXED32, /**< unsigned int32 (4 bytes) */ + PROTOBUF_C_TYPE_UINT64, /**< unsigned int64 */ + PROTOBUF_C_TYPE_FIXED64, /**< unsigned int64 (8 bytes) */ + PROTOBUF_C_TYPE_FLOAT, /**< float */ + PROTOBUF_C_TYPE_DOUBLE, /**< double */ + PROTOBUF_C_TYPE_BOOL, /**< boolean */ + PROTOBUF_C_TYPE_ENUM, /**< enumerated type */ + PROTOBUF_C_TYPE_STRING, /**< UTF-8 or ASCII string */ + PROTOBUF_C_TYPE_BYTES, /**< arbitrary byte sequence */ + PROTOBUF_C_TYPE_MESSAGE, /**< nested message */ } ProtobufCType; /** @@ -332,11 +332,11 @@ typedef enum { * https://developers.google.com/protocol-buffers/docs/encoding#structure */ typedef enum { - PROTOBUF_C_WIRE_TYPE_VARINT = 0, - PROTOBUF_C_WIRE_TYPE_64BIT = 1, - PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED = 2, - /* "Start group" and "end group" wire types are unsupported. */ - PROTOBUF_C_WIRE_TYPE_32BIT = 5, + PROTOBUF_C_WIRE_TYPE_VARINT = 0, + PROTOBUF_C_WIRE_TYPE_64BIT = 1, + PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED = 2, + /* "Start group" and "end group" wire types are unsupported. */ + PROTOBUF_C_WIRE_TYPE_32BIT = 5, } ProtobufCWireType; struct ProtobufCAllocator; @@ -382,14 +382,14 @@ typedef void (*ProtobufCServiceDestroy)(ProtobufCService *); * Structure for defining a custom memory allocator. */ struct ProtobufCAllocator { - /** Function to allocate memory. */ - void *(*alloc)(void *allocator_data, size_t size); + /** Function to allocate memory. */ + void *(*alloc)(void *allocator_data, size_t size); - /** Function to free memory. */ - void (*free)(void *allocator_data, void *pointer); + /** Function to free memory. */ + void (*free)(void *allocator_data, void *pointer); - /** Opaque pointer passed to `alloc` and `free` functions. */ - void *allocator_data; + /** Opaque pointer passed to `alloc` and `free` functions. */ + void *allocator_data; }; /** @@ -400,8 +400,8 @@ struct ProtobufCAllocator { * `NUL`-terminated. */ struct ProtobufCBinaryData { - size_t len; /**< Number of bytes in the `data` field. */ - uint8_t *data; /**< Data bytes. */ + size_t len; /**< Number of bytes in the `data` field. */ + uint8_t *data; /**< Data bytes. 
*/ }; /** @@ -440,10 +440,8 @@ protobuf_c_message_pack_to_buffer(&message, &tmp); ~~~ */ struct ProtobufCBuffer { - /** Append function. Consumes the `len` bytes stored at `data`. */ - void (*append)(ProtobufCBuffer *buffer, - size_t len, - const uint8_t *data); + /** Append function. Consumes the `len` bytes stored at `data`. */ + void (*append)(ProtobufCBuffer *buffer, size_t len, const uint8_t *data); }; /** @@ -475,142 +473,142 @@ PROTOBUF_C_BUFFER_SIMPLE_CLEAR(&simple); * \see PROTOBUF_C_BUFFER_SIMPLE_CLEAR */ struct ProtobufCBufferSimple { - /** "Base class". */ - ProtobufCBuffer base; - /** Number of bytes allocated in `data`. */ - size_t alloced; - /** Number of bytes currently stored in `data`. */ - size_t len; - /** Data bytes. */ - uint8_t *data; - /** Whether `data` must be freed. */ - protobuf_c_boolean must_free_data; - /** Allocator to use. May be NULL to indicate the system allocator. */ - ProtobufCAllocator *allocator; + /** "Base class". */ + ProtobufCBuffer base; + /** Number of bytes allocated in `data`. */ + size_t alloced; + /** Number of bytes currently stored in `data`. */ + size_t len; + /** Data bytes. */ + uint8_t *data; + /** Whether `data` must be freed. */ + protobuf_c_boolean must_free_data; + /** Allocator to use. May be NULL to indicate the system allocator. */ + ProtobufCAllocator *allocator; }; /** * Describes an enumeration as a whole, with all of its values. */ struct ProtobufCEnumDescriptor { - /** Magic value checked to ensure that the API is used correctly. */ - uint32_t magic; - - /** The qualified name (e.g., "namespace.Type"). */ - const char *name; - /** The unqualified name as given in the .proto file (e.g., "Type"). */ - const char *short_name; - /** Identifier used in generated C code. */ - const char *c_name; - /** The dot-separated namespace. */ - const char *package_name; - - /** Number elements in `values`. */ - unsigned n_values; - /** Array of distinct values, sorted by numeric value. */ - const ProtobufCEnumValue *values; - - /** Number of elements in `values_by_name`. */ - unsigned n_value_names; - /** Array of named values, including aliases, sorted by name. */ - const ProtobufCEnumValueIndex *values_by_name; - - /** Number of elements in `value_ranges`. */ - unsigned n_value_ranges; - /** Value ranges, for faster lookups by numeric value. */ - const ProtobufCIntRange *value_ranges; - - /** Reserved for future use. */ - void *reserved1; - /** Reserved for future use. */ - void *reserved2; - /** Reserved for future use. */ - void *reserved3; - /** Reserved for future use. */ - void *reserved4; + /** Magic value checked to ensure that the API is used correctly. */ + uint32_t magic; + + /** The qualified name (e.g., "namespace.Type"). */ + const char *name; + /** The unqualified name as given in the .proto file (e.g., "Type"). */ + const char *short_name; + /** Identifier used in generated C code. */ + const char *c_name; + /** The dot-separated namespace. */ + const char *package_name; + + /** Number elements in `values`. */ + unsigned n_values; + /** Array of distinct values, sorted by numeric value. */ + const ProtobufCEnumValue *values; + + /** Number of elements in `values_by_name`. */ + unsigned n_value_names; + /** Array of named values, including aliases, sorted by name. */ + const ProtobufCEnumValueIndex *values_by_name; + + /** Number of elements in `value_ranges`. */ + unsigned n_value_ranges; + /** Value ranges, for faster lookups by numeric value. */ + const ProtobufCIntRange *value_ranges; + + /** Reserved for future use. 
*/ + void *reserved1; + /** Reserved for future use. */ + void *reserved2; + /** Reserved for future use. */ + void *reserved3; + /** Reserved for future use. */ + void *reserved4; }; /** * Represents a single value of an enumeration. */ struct ProtobufCEnumValue { - /** The string identifying this value in the .proto file. */ - const char *name; + /** The string identifying this value in the .proto file. */ + const char *name; - /** The string identifying this value in generated C code. */ - const char *c_name; + /** The string identifying this value in generated C code. */ + const char *c_name; - /** The numeric value assigned in the .proto file. */ - int value; + /** The numeric value assigned in the .proto file. */ + int value; }; /** * Used by `ProtobufCEnumDescriptor` to look up enum values. */ struct ProtobufCEnumValueIndex { - /** Name of the enum value. */ - const char *name; - /** Index into values[] array. */ - unsigned index; + /** Name of the enum value. */ + const char *name; + /** Index into values[] array. */ + unsigned index; }; /** * Describes a single field in a message. */ struct ProtobufCFieldDescriptor { - /** Name of the field as given in the .proto file. */ - const char *name; - - /** Tag value of the field as given in the .proto file. */ - uint32_t id; - - /** Whether the field is `REQUIRED`, `OPTIONAL`, or `REPEATED`. */ - ProtobufCLabel label; - - /** The type of the field. */ - ProtobufCType type; - - /** - * The offset in bytes of the message's C structure's quantifier field - * (the `has_MEMBER` field for optional members or the `n_MEMBER` field - * for repeated members or the case enum for oneofs). - */ - unsigned quantifier_offset; - - /** - * The offset in bytes into the message's C structure for the member - * itself. - */ - unsigned offset; - - /** - * A type-specific descriptor. - * - * If `type` is `PROTOBUF_C_TYPE_ENUM`, then `descriptor` points to the - * corresponding `ProtobufCEnumDescriptor`. - * - * If `type` is `PROTOBUF_C_TYPE_MESSAGE`, then `descriptor` points to - * the corresponding `ProtobufCMessageDescriptor`. - * - * Otherwise this field is NULL. - */ - const void *descriptor; /* for MESSAGE and ENUM types */ - - /** The default value for this field, if defined. May be NULL. */ - const void *default_value; - - /** - * A flag word. Zero or more of the bits defined in the - * `ProtobufCFieldFlag` enum may be set. - */ - uint32_t flags; - - /** Reserved for future use. */ - unsigned reserved_flags; - /** Reserved for future use. */ - void *reserved2; - /** Reserved for future use. */ - void *reserved3; + /** Name of the field as given in the .proto file. */ + const char *name; + + /** Tag value of the field as given in the .proto file. */ + uint32_t id; + + /** Whether the field is `REQUIRED`, `OPTIONAL`, or `REPEATED`. */ + ProtobufCLabel label; + + /** The type of the field. */ + ProtobufCType type; + + /** + * The offset in bytes of the message's C structure's quantifier field + * (the `has_MEMBER` field for optional members or the `n_MEMBER` field + * for repeated members or the case enum for oneofs). + */ + unsigned quantifier_offset; + + /** + * The offset in bytes into the message's C structure for the member + * itself. + */ + unsigned offset; + + /** + * A type-specific descriptor. + * + * If `type` is `PROTOBUF_C_TYPE_ENUM`, then `descriptor` points to the + * corresponding `ProtobufCEnumDescriptor`. + * + * If `type` is `PROTOBUF_C_TYPE_MESSAGE`, then `descriptor` points to + * the corresponding `ProtobufCMessageDescriptor`. 
+ * + * Otherwise this field is NULL. + */ + const void *descriptor; /* for MESSAGE and ENUM types */ + + /** The default value for this field, if defined. May be NULL. */ + const void *default_value; + + /** + * A flag word. Zero or more of the bits defined in the + * `ProtobufCFieldFlag` enum may be set. + */ + uint32_t flags; + + /** Reserved for future use. */ + unsigned reserved_flags; + /** Reserved for future use. */ + void *reserved2; + /** Reserved for future use. */ + void *reserved3; }; /** @@ -622,13 +620,13 @@ struct ProtobufCFieldDescriptor { * sorted. */ struct ProtobufCIntRange { - int start_value; - unsigned orig_index; - /* - * NOTE: the number of values in the range can be inferred by looking - * at the next element's orig_index. A dummy element is added to make - * this simple. - */ + int start_value; + unsigned orig_index; + /* + * NOTE: the number of values in the range can be inferred by looking + * at the next element's orig_index. A dummy element is added to make + * this simple. + */ }; /** @@ -647,122 +645,120 @@ struct ProtobufCIntRange { * like protobuf_c_message_free_unpacked(). */ struct ProtobufCMessage { - /** The descriptor for this message type. */ - const ProtobufCMessageDescriptor *descriptor; - /** The number of elements in `unknown_fields`. */ - unsigned n_unknown_fields; - /** The fields that weren't recognized by the parser. */ - ProtobufCMessageUnknownField *unknown_fields; + /** The descriptor for this message type. */ + const ProtobufCMessageDescriptor *descriptor; + /** The number of elements in `unknown_fields`. */ + unsigned n_unknown_fields; + /** The fields that weren't recognized by the parser. */ + ProtobufCMessageUnknownField *unknown_fields; }; /** * Describes a message. */ struct ProtobufCMessageDescriptor { - /** Magic value checked to ensure that the API is used correctly. */ - uint32_t magic; - - /** The qualified name (e.g., "namespace.Type"). */ - const char *name; - /** The unqualified name as given in the .proto file (e.g., "Type"). */ - const char *short_name; - /** Identifier used in generated C code. */ - const char *c_name; - /** The dot-separated namespace. */ - const char *package_name; - - /** - * Size in bytes of the C structure representing an instance of this - * type of message. - */ - size_t sizeof_message; - - /** Number of elements in `fields`. */ - unsigned n_fields; - /** Field descriptors, sorted by tag number. */ - const ProtobufCFieldDescriptor *fields; - /** Used for looking up fields by name. */ - const unsigned *fields_sorted_by_name; - - /** Number of elements in `field_ranges`. */ - unsigned n_field_ranges; - /** Used for looking up fields by id. */ - const ProtobufCIntRange *field_ranges; - - /** Message initialisation function. */ - ProtobufCMessageInit message_init; - - /** Reserved for future use. */ - void *reserved1; - /** Reserved for future use. */ - void *reserved2; - /** Reserved for future use. */ - void *reserved3; + /** Magic value checked to ensure that the API is used correctly. */ + uint32_t magic; + + /** The qualified name (e.g., "namespace.Type"). */ + const char *name; + /** The unqualified name as given in the .proto file (e.g., "Type"). */ + const char *short_name; + /** Identifier used in generated C code. */ + const char *c_name; + /** The dot-separated namespace. */ + const char *package_name; + + /** + * Size in bytes of the C structure representing an instance of this + * type of message. + */ + size_t sizeof_message; + + /** Number of elements in `fields`. 
*/ + unsigned n_fields; + /** Field descriptors, sorted by tag number. */ + const ProtobufCFieldDescriptor *fields; + /** Used for looking up fields by name. */ + const unsigned *fields_sorted_by_name; + + /** Number of elements in `field_ranges`. */ + unsigned n_field_ranges; + /** Used for looking up fields by id. */ + const ProtobufCIntRange *field_ranges; + + /** Message initialisation function. */ + ProtobufCMessageInit message_init; + + /** Reserved for future use. */ + void *reserved1; + /** Reserved for future use. */ + void *reserved2; + /** Reserved for future use. */ + void *reserved3; }; /** * An unknown message field. */ struct ProtobufCMessageUnknownField { - /** The tag number. */ - uint32_t tag; - /** The wire type of the field. */ - ProtobufCWireType wire_type; - /** Number of bytes in `data`. */ - size_t len; - /** Field data. */ - uint8_t *data; + /** The tag number. */ + uint32_t tag; + /** The wire type of the field. */ + ProtobufCWireType wire_type; + /** Number of bytes in `data`. */ + size_t len; + /** Field data. */ + uint8_t *data; }; /** * Method descriptor. */ struct ProtobufCMethodDescriptor { - /** Method name. */ - const char *name; - /** Input message descriptor. */ - const ProtobufCMessageDescriptor *input; - /** Output message descriptor. */ - const ProtobufCMessageDescriptor *output; + /** Method name. */ + const char *name; + /** Input message descriptor. */ + const ProtobufCMessageDescriptor *input; + /** Output message descriptor. */ + const ProtobufCMessageDescriptor *output; }; /** * Service. */ struct ProtobufCService { - /** Service descriptor. */ - const ProtobufCServiceDescriptor *descriptor; - /** Function to invoke the service. */ - void (*invoke)(ProtobufCService *service, - unsigned method_index, - const ProtobufCMessage *input, - ProtobufCClosure closure, - void *closure_data); - /** Function to destroy the service. */ - void (*destroy)(ProtobufCService *service); + /** Service descriptor. */ + const ProtobufCServiceDescriptor *descriptor; + /** Function to invoke the service. */ + void (*invoke)(ProtobufCService *service, unsigned method_index, + const ProtobufCMessage *input, ProtobufCClosure closure, + void *closure_data); + /** Function to destroy the service. */ + void (*destroy)(ProtobufCService *service); }; /** * Service descriptor. */ struct ProtobufCServiceDescriptor { - /** Magic value checked to ensure that the API is used correctly. */ - uint32_t magic; - - /** Service name. */ - const char *name; - /** Short version of service name. */ - const char *short_name; - /** C identifier for the service name. */ - const char *c_name; - /** Package name. */ - const char *package; - /** Number of elements in `methods`. */ - unsigned n_methods; - /** Method descriptors, in the order defined in the .proto file. */ - const ProtobufCMethodDescriptor *methods; - /** Sort index of methods. */ - const unsigned *method_indices_by_name; + /** Magic value checked to ensure that the API is used correctly. */ + uint32_t magic; + + /** Service name. */ + const char *name; + /** Short version of service name. */ + const char *short_name; + /** C identifier for the service name. */ + const char *c_name; + /** Package name. */ + const char *package; + /** Number of elements in `methods`. */ + unsigned n_methods; + /** Method descriptors, in the order defined in the .proto file. */ + const ProtobufCMethodDescriptor *methods; + /** Sort index of methods. 
*/ + const unsigned *method_indices_by_name; }; /** @@ -772,8 +768,7 @@ struct ProtobufCServiceDescriptor { * \return A string containing the version number of protobuf-c. */ PROTOBUF_C__API -const char * -protobuf_c_version(void); +const char *protobuf_c_version(void); /** * Get the version of the protobuf-c library. Note that this is the version of @@ -783,26 +778,25 @@ protobuf_c_version(void); * protobuf-c, represented in base-10 as (MAJOR*1E6) + (MINOR*1E3) + PATCH. */ PROTOBUF_C__API -uint32_t -protobuf_c_version_number(void); +uint32_t protobuf_c_version_number(void); /** * The version of the protobuf-c headers, represented as a string using the same * format as protobuf_c_version(). */ -#define PROTOBUF_C_VERSION "1.3.1" +#define PROTOBUF_C_VERSION "1.3.1" /** * The version of the protobuf-c headers, represented as an integer using the * same format as protobuf_c_version_number(). */ -#define PROTOBUF_C_VERSION_NUMBER 1003001 +#define PROTOBUF_C_VERSION_NUMBER 1003001 /** * The minimum protoc-c version which works with the current version of the * protobuf-c headers. */ -#define PROTOBUF_C_MIN_COMPILER_VERSION 1000000 +#define PROTOBUF_C_MIN_COMPILER_VERSION 1000000 /** * Look up a `ProtobufCEnumValue` from a `ProtobufCEnumDescriptor` by name. @@ -818,10 +812,8 @@ protobuf_c_version_number(void); * If not found or if the optimize_for = CODE_SIZE option was set. */ PROTOBUF_C__API -const ProtobufCEnumValue * -protobuf_c_enum_descriptor_get_value_by_name( - const ProtobufCEnumDescriptor *desc, - const char *name); +const ProtobufCEnumValue *protobuf_c_enum_descriptor_get_value_by_name( + const ProtobufCEnumDescriptor *desc, const char *name); /** * Look up a `ProtobufCEnumValue` from a `ProtobufCEnumDescriptor` by numeric @@ -839,10 +831,8 @@ protobuf_c_enum_descriptor_get_value_by_name( * If not found. */ PROTOBUF_C__API -const ProtobufCEnumValue * -protobuf_c_enum_descriptor_get_value( - const ProtobufCEnumDescriptor *desc, - int value); +const ProtobufCEnumValue *protobuf_c_enum_descriptor_get_value( + const ProtobufCEnumDescriptor *desc, int value); /** * Look up a `ProtobufCFieldDescriptor` from a `ProtobufCMessageDescriptor` by @@ -858,10 +848,8 @@ protobuf_c_enum_descriptor_get_value( * If not found or if the optimize_for = CODE_SIZE option was set. */ PROTOBUF_C__API -const ProtobufCFieldDescriptor * -protobuf_c_message_descriptor_get_field_by_name( - const ProtobufCMessageDescriptor *desc, - const char *name); +const ProtobufCFieldDescriptor *protobuf_c_message_descriptor_get_field_by_name( + const ProtobufCMessageDescriptor *desc, const char *name); /** * Look up a `ProtobufCFieldDescriptor` from a `ProtobufCMessageDescriptor` by @@ -877,10 +865,8 @@ protobuf_c_message_descriptor_get_field_by_name( * If not found. */ PROTOBUF_C__API -const ProtobufCFieldDescriptor * -protobuf_c_message_descriptor_get_field( - const ProtobufCMessageDescriptor *desc, - unsigned value); +const ProtobufCFieldDescriptor *protobuf_c_message_descriptor_get_field( + const ProtobufCMessageDescriptor *desc, unsigned value); /** * Determine the number of bytes required to store the serialised message. @@ -891,8 +877,7 @@ protobuf_c_message_descriptor_get_field( * Number of bytes. */ PROTOBUF_C__API -size_t -protobuf_c_message_get_packed_size(const ProtobufCMessage *message); +size_t protobuf_c_message_get_packed_size(const ProtobufCMessage *message); /** * Serialise a message from its in-memory representation. 
@@ -911,8 +896,7 @@ protobuf_c_message_get_packed_size(const ProtobufCMessage *message); * Number of bytes stored in `out`. */ PROTOBUF_C__API -size_t -protobuf_c_message_pack(const ProtobufCMessage *message, uint8_t *out); +size_t protobuf_c_message_pack(const ProtobufCMessage *message, uint8_t *out); /** * Serialise a message from its in-memory representation to a virtual buffer. @@ -928,10 +912,8 @@ protobuf_c_message_pack(const ProtobufCMessage *message, uint8_t *out); * Number of bytes passed to the virtual buffer. */ PROTOBUF_C__API -size_t -protobuf_c_message_pack_to_buffer( - const ProtobufCMessage *message, - ProtobufCBuffer *buffer); +size_t protobuf_c_message_pack_to_buffer(const ProtobufCMessage *message, + ProtobufCBuffer *buffer); /** * Unpack a serialised message into an in-memory representation. @@ -951,12 +933,9 @@ protobuf_c_message_pack_to_buffer( * If an error occurred during unpacking. */ PROTOBUF_C__API -ProtobufCMessage * -protobuf_c_message_unpack( - const ProtobufCMessageDescriptor *descriptor, - ProtobufCAllocator *allocator, - size_t len, - const uint8_t *data); +ProtobufCMessage *protobuf_c_message_unpack( + const ProtobufCMessageDescriptor *descriptor, ProtobufCAllocator *allocator, + size_t len, const uint8_t *data); /** * Free an unpacked message object. @@ -971,10 +950,8 @@ protobuf_c_message_unpack( * specify the default allocator. */ PROTOBUF_C__API -void -protobuf_c_message_free_unpacked( - ProtobufCMessage *message, - ProtobufCAllocator *allocator); +void protobuf_c_message_free_unpacked(ProtobufCMessage *message, + ProtobufCAllocator *allocator); /** * Check the validity of a message object. @@ -988,11 +965,11 @@ protobuf_c_message_free_unpacked( * Message is invalid. */ PROTOBUF_C__API -protobuf_c_boolean -protobuf_c_message_check(const ProtobufCMessage *); +protobuf_c_boolean protobuf_c_message_check(const ProtobufCMessage *); /** Message initialiser. */ -#define PROTOBUF_C_MESSAGE_INIT(descriptor) { descriptor, 0, NULL } +#define PROTOBUF_C_MESSAGE_INIT(descriptor) \ + { descriptor, 0, NULL } /** * Initialise a message object from a message descriptor. @@ -1003,10 +980,8 @@ protobuf_c_message_check(const ProtobufCMessage *); * Allocated block of memory of size `descriptor->sizeof_message`. */ PROTOBUF_C__API -void -protobuf_c_message_init( - const ProtobufCMessageDescriptor *descriptor, - void *message); +void protobuf_c_message_init(const ProtobufCMessageDescriptor *descriptor, + void *message); /** * Free a service. @@ -1015,8 +990,7 @@ protobuf_c_message_init( * The service object to free. */ PROTOBUF_C__API -void -protobuf_c_service_destroy(ProtobufCService *service); +void protobuf_c_service_destroy(ProtobufCService *service); /** * Look up a `ProtobufCMethodDescriptor` by name. @@ -1034,36 +1008,29 @@ protobuf_c_service_destroy(ProtobufCService *service); PROTOBUF_C__API const ProtobufCMethodDescriptor * protobuf_c_service_descriptor_get_method_by_name( - const ProtobufCServiceDescriptor *desc, - const char *name); + const ProtobufCServiceDescriptor *desc, const char *name); /** * Initialise a `ProtobufCBufferSimple` object. */ -#define PROTOBUF_C_BUFFER_SIMPLE_INIT(array_of_bytes) \ -{ \ - { protobuf_c_buffer_simple_append }, \ - sizeof(array_of_bytes), \ - 0, \ - (array_of_bytes), \ - 0, \ - NULL \ -} +#define PROTOBUF_C_BUFFER_SIMPLE_INIT(array_of_bytes) \ + { \ + {protobuf_c_buffer_simple_append}, sizeof(array_of_bytes), 0, \ + (array_of_bytes), 0, NULL \ + } /** * Clear a `ProtobufCBufferSimple` object, freeing any allocated memory. 
*/ -#define PROTOBUF_C_BUFFER_SIMPLE_CLEAR(simp_buf) \ -do { \ - if ((simp_buf)->must_free_data) { \ - if ((simp_buf)->allocator != NULL) \ - (simp_buf)->allocator->free( \ - (simp_buf)->allocator, \ - (simp_buf)->data); \ - else \ - free((simp_buf)->data); \ - } \ -} while (0) +#define PROTOBUF_C_BUFFER_SIMPLE_CLEAR(simp_buf) \ + do { \ + if ((simp_buf)->must_free_data) { \ + if ((simp_buf)->allocator != NULL) \ + (simp_buf)->allocator->free((simp_buf)->allocator, (simp_buf)->data); \ + else \ + free((simp_buf)->data); \ + } \ + } while (0) /** * The `append` method for `ProtobufCBufferSimple`. @@ -1077,27 +1044,20 @@ do { \ * Data to append. */ PROTOBUF_C__API -void -protobuf_c_buffer_simple_append( - ProtobufCBuffer *buffer, - size_t len, - const unsigned char *data); +void protobuf_c_buffer_simple_append(ProtobufCBuffer *buffer, size_t len, + const unsigned char *data); PROTOBUF_C__API -void -protobuf_c_service_generated_init( - ProtobufCService *service, - const ProtobufCServiceDescriptor *descriptor, - ProtobufCServiceDestroy destroy); +void protobuf_c_service_generated_init( + ProtobufCService *service, const ProtobufCServiceDescriptor *descriptor, + ProtobufCServiceDestroy destroy); PROTOBUF_C__API -void -protobuf_c_service_invoke_internal( - ProtobufCService *service, - unsigned method_index, - const ProtobufCMessage *input, - ProtobufCClosure closure, - void *closure_data); +void protobuf_c_service_invoke_internal(ProtobufCService *service, + unsigned method_index, + const ProtobufCMessage *input, + ProtobufCClosure closure, + void *closure_data); /**@}*/ diff --git a/tools/build.sh b/tools/build.sh index 3b7204baefe6d843cbb4d0a237cf5e96f0c28373..1408822e46850752bcd448350fc483c25f70ae9a 100755 --- a/tools/build.sh +++ b/tools/build.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash NETS="" -declare -a supportedNets=("googlenet" "mobilenet" "yolo" "squeezenet" "resnet" "mobilenetssd" "nlp") +declare -a supportedNets=("googlenet" "mobilenet" "yolo" "squeezenet" "resnet" "mobilenetssd" "nlp" "mobilenetfssd" "genet") build_for_mac() { if [ ! `which brew` ]; then diff --git a/tools/op.cmake b/tools/op.cmake index e17afb445dcb2ccb12c1cce4e05e4531c3e8cde9..6158a318140cd4befebb68434dc8ef53d1b7cd07 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -177,6 +177,7 @@ if (CON GREATER -1) set(FOUND_MATCH ON) endif() + if(NOT FOUND_MATCH) message("--default--") set(BATCHNORM_OP ON) @@ -385,4 +386,4 @@ endif() if (SHAPE_OP) add_definitions(-DSHAPE_OP) -endif() +endif() \ No newline at end of file diff --git a/tools/pre-commit.hooks/cpplint.hook b/tools/pre-commit.hooks/cpplint.hook index 1c9c0a04cf358dd0001f75d3b4865c8d7fc934c6..15541fc0be340e2ca5c296d78f702b0190b5ffea 100644 --- a/tools/pre-commit.hooks/cpplint.hook +++ b/tools/pre-commit.hooks/cpplint.hook @@ -3,7 +3,7 @@ TOTAL_ERRORS=0 # The trick to remove deleted files: https://stackoverflow.com/a/2413151 -for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v "protobuf-c.*"); do +for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | grep -v ".pb.cpp" | grep -v ".pb.h"); do cpplint $file; TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); done
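Note on the Fusion*Param hunks above: every fused-convolution parameter class touched here (FusionConvAddBNReluParam, FusionConvBNAddReluParam, FusionConvBNParam, FusionConvAddBNParam, FusionDWConvBNReluParam, FusionConvBNReluParam) switches its base from OpParam to ConvParam, forwards inputs/outputs/attrs/scope to the ConvParam constructor, and deletes its own copies of the input/filter/strides/paddings/dilations/groups members and accessors. The sketch below shows the shape of that refactor only; it uses simplified stand-in types (a plain attribute map instead of VariableNameMap/AttributeMap/Scope and the *From helpers), so it is an illustration, not the framework code.

```cpp
// Minimal sketch of the "derive from ConvParam, keep only the fused extras"
// pattern applied in the hunks above. All types are simplified stand-ins.
#include <iostream>
#include <map>
#include <string>

using AttributeMap = std::map<std::string, float>;

class ConvParamSketch {
 public:
  explicit ConvParamSketch(const AttributeMap &attrs) {
    // The shared convolution state (input, filter, strides, paddings,
    // dilations, groups) is parsed exactly once, in the base class.
    groups_ = attrs.count("groups") ? static_cast<int>(attrs.at("groups")) : 1;
  }
  int Groups() const { return groups_; }

 private:
  int groups_ = 1;
};

class FusionConvBNParamSketch : public ConvParamSketch {
 public:
  explicit FusionConvBNParamSketch(const AttributeMap &attrs)
      : ConvParamSketch(attrs) {  // delegate the shared conv parsing
    // The derived class only reads the batch-norm specific attributes.
    epsilon_ = attrs.count("epsilon") ? attrs.at("epsilon") : 1e-5f;
    momentum_ = attrs.count("momentum") ? attrs.at("momentum") : 0.9f;
  }
  float Epsilon() const { return epsilon_; }
  float Momentum() const { return momentum_; }

 private:
  float epsilon_ = 1e-5f;
  float momentum_ = 0.9f;
};

int main() {
  const AttributeMap attrs{
      {"groups", 1.0f}, {"epsilon", 1e-5f}, {"momentum", 0.9f}};
  FusionConvBNParamSketch param(attrs);
  std::cout << param.Groups() << " " << param.Epsilon() << std::endl;
  return 0;
}
```

With this structure the shared convolution state lives in one place, so a change to how strides or groups are read only has to be made in ConvParam.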
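A related detail in the same hunks: the added lines qualify every helper call as OpParam::InputYFrom, OpParam::GetAttr, OpParam::OutFrom and so on, where the removed lines called them unqualified. Assuming ConvParam is a class template parameterised on Dtype (the angle-bracketed template argument lists are not visible in the text above), a plausible reason is C++ two-phase lookup: the new base class is dependent, and unqualified names inherited through a dependent base are not found when the template is defined. The standalone toy below, with invented names, illustrates the issue; it is not framework code.

```cpp
// Illustration of why explicit OpParam:: qualification becomes necessary
// once the base class is a dependent class template.
#include <iostream>

struct OpParamToy {
  static int ReadAxis() { return 1; }  // stand-in for a static helper
};

template <typename Dtype>
struct ConvParamToy : OpParamToy {};

template <typename Dtype>
struct FusionParamToy : ConvParamToy<Dtype> {
  FusionParamToy() {
    // An unqualified ReadAxis() would not compile under two-phase lookup,
    // because ConvParamToy<Dtype> is a dependent base and names inherited
    // through it are not considered until instantiation. Qualifying the call
    // with the non-dependent OpParamToy makes it well-formed:
    axis_ = OpParamToy::ReadAxis();
  }
  int axis_ = 0;
};

int main() {
  FusionParamToy<float> p;
  std::cout << p.axis_ << std::endl;
  return 0;
}
```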
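Two tooling changes close out the diff: tools/build.sh adds "mobilenetfssd" and "genet" to supportedNets, and tools/pre-commit.hooks/cpplint.hook drops the grep -v "protobuf-c.*" filter, so the vendored protobuf-c sources are now checked by cpplint in the pre-commit hook; that is presumably why protobuf-c.h is re-indented wholesale in this same diff. The exact command for building one of the new networks depends on build.sh's argument parsing, which these hunks do not show; an invocation along the lines of `sh tools/build.sh android genet` is the likely form, but treat that as an assumption rather than a documented interface.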