Merge remote-tracking branch 'upstream/develop' into develop

c5ad3169 · qnqinan · d1616b88 · 09e0f909 · c5ad3169 · c5ad3169
237 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -96,3 +96,7 @@ metal/paddle-mobile/paddle-mobile/CPU/libpaddle-mobile.a
 metal/paddle-mobile-demo/paddle-mobile-demo/images
 metal/paddle-mobile-demo/paddle-mobile-demo/models
+metal/paddle-mobile-demo/paddle-mobile-demo/Resources
+metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
+metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
+metal/MobileNetDemo/MobileNetDemo/Resources
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
 cmake_minimum_required(VERSION 3.0.0)
-option(USE_OPENMP "openmp support" ON)
+option(USE_OPENMP    "build with openmp support" ON)
-option(DEBUGING "enable debug mode" ON)
+option(USE_EXCEPTION "build with exception" ON)
-option(USE_EXCEPTION "use std exception" ON)
+option(WITH_LOGGING  "print logging for debug" ON)
-option(SYMBOL_HIDDEN "symbol hidden" OFF) # on when use jni or ios io
+option(WITH_SYMBOL   "build with all symbols" ON) # turn off if use jni or ios io
-option(LOG_PROFILE "log profile" OFF)
+option(WITH_PROFILE  "print op profile for debug" OFF)
+option(WITH_TEST     "build with unit tests" ON)
 # select the platform to build
-option(CPU "armv7 with neon" ON)
+option(CPU        "build with arm CPU support" ON)
-option(GPU_MALI "mali gpu" OFF)
+option(GPU_MALI   "build with arm mali GPU support" OFF)
-option(GPU_CL "opencl gpu" OFF)
+option(GPU_CL     "build with OpenCL support" OFF)
+option(FPGA       "build with FPGA support" OFF)
-option(FPGA "fpga" OFF)
 if(FPGA)
-    option(FPGAV1 "fpga v1" ON)
+  option(FPGAV1     "build with fpga v1 support" ON)
-    option(FPGAV2 "fpga v2" OFF)
+  option(FPGAV2     "build with fpga v2 support" OFF)
 endif()
 project(paddle-mobile)
@@ -23,7 +23,6 @@ file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
 file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
 include_directories(src/)
-set(CMAKE_BUILD_TYPE Release)
 set(CMAKE_CXX_FLAGS "-O3 -s -DNDEBUG ${CMAKE_CXX_FLAGS}")
 if(IS_IOS)
    set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc \
@@ -33,13 +32,18 @@ else()
    set(CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}")
 endif()
-if(DEBUGING)
+# Enable OpenMP only when the active compiler actually supports it.
+# FindOpenMP probes the toolchain and reports the compiler-specific switch in
+# OpenMP_CXX_FLAGS, so the flag is no longer assumed to be spelled -fopenmp.
+if(USE_OPENMP)
+    find_package(OpenMP)
+    if(OPENMP_FOUND)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+        # Directory-scoped define: every target declared below sees it.
+        add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
+    else()
+        # Keep the build going without OpenMP instead of failing later on an
+        # unsupported compiler flag.
+        message(WARNING "USE_OPENMP is ON but no OpenMP support was detected; building without OpenMP")
+    endif()
+endif()
+if(WITH_LOGGING)
    message(STATUS "debugging mode")
    add_definitions(-DPADDLE_MOBILE_DEBUG)
 else()
 endif()
-if(SYMBOL_HIDDEN)
+if(NOT WITH_SYMBOL)
    add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
 endif()
@@ -50,15 +54,10 @@ else()
    add_definitions(-fno-exceptions)
 endif()
-if(LOG_PROFILE)
+if(WITH_PROFILE)
    add_definitions(-DPADDLE_MOBILE_PROFILE)
 endif()
-if(USE_OPENMP)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
-    add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
-endif()
 # platform control
 if(ARM_LINUX)
    include("${CMAKE_CURRENT_LIST_DIR}/tools/arm-platform.cmake")
@@ -217,7 +216,6 @@ endif()
 set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGA_NET_V1" "FPGA_NET_V2" "NLP")
 include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
 # build library
 if(ANDROID_NDK_TOOLCHAIN_INCLUDED)
    list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS)
@@ -239,7 +237,7 @@ else()
 endif()
 # unit test
-if(DEBUGING)
+if(WITH_TEST AND WITH_SYMBOL)
    if(IS_IOS)
    else()
        add_subdirectory(test)

--- a/metal/MobileNetDemo/MobileNetDemo.xcodeproj/project.pbxproj
+++ b/metal/MobileNetDemo/MobileNetDemo.xcodeproj/project.pbxproj
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 50;
+	objects = {
+/* Begin PBXBuildFile section */
+		FA37E99B9AD29A07FEE8E743 /* Pods_MobileNetDemo.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = DD3A2E3175627EF63DACA36C /* Pods_MobileNetDemo.framework */; };
+		FC74BB3621DFAFEC0055232B /* MobileNet.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC74BB3521DFAFEC0055232B /* MobileNet.swift */; };
+		FCB40DA221E0B7C60075EC91 /* MobilenetPreProcess.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCB40DA121E0B7C60075EC91 /* MobilenetPreProcess.metal */; };
+		FCB40DA421E0B85B0075EC91 /* MetalHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCB40DA321E0B85B0075EC91 /* MetalHelper.swift */; };
+		FCB40DE921E0B9410075EC91 /* banana.jpeg in Resources */ = {isa = PBXBuildFile; fileRef = FCB40DD221E0B9410075EC91 /* banana.jpeg */; };
+		FCB40E5121E0CEBB0075EC91 /* mobilenet_model in Resources */ = {isa = PBXBuildFile; fileRef = FCB40E4F21E0CEBB0075EC91 /* mobilenet_model */; };
+		FCB40E5221E0CEBB0075EC91 /* mobilenet_params in Resources */ = {isa = PBXBuildFile; fileRef = FCB40E5021E0CEBB0075EC91 /* mobilenet_params */; };
+		FCB40E5421E0CEF80075EC91 /* synset.txt in Resources */ = {isa = PBXBuildFile; fileRef = FCB40E5321E0CEF80075EC91 /* synset.txt */; };
+		FCD3873821E1C31F0052F3D0 /* paddle_mobile.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = FCD3873721E1C31F0052F3D0 /* paddle_mobile.framework */; };
+		FCD3873921E1C31F0052F3D0 /* paddle_mobile.framework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = FCD3873721E1C31F0052F3D0 /* paddle_mobile.framework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; };
+		FCF2870921DFAEC7009A87DA /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCF2870821DFAEC7009A87DA /* AppDelegate.swift */; };
+		FCF2870B21DFAEC7009A87DA /* ViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCF2870A21DFAEC7009A87DA /* ViewController.swift */; };
+		FCF2870E21DFAEC7009A87DA /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FCF2870C21DFAEC7009A87DA /* Main.storyboard */; };
+		FCF2871021DFAEC8009A87DA /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = FCF2870F21DFAEC8009A87DA /* Assets.xcassets */; };
+		FCF2871321DFAEC8009A87DA /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FCF2871121DFAEC8009A87DA /* LaunchScreen.storyboard */; };
+/* End PBXBuildFile section */
+/* Begin PBXCopyFilesBuildPhase section */
+		FCB40DFC21E0BC360075EC91 /* Embed Frameworks */ = {
+			isa = PBXCopyFilesBuildPhase;
+			buildActionMask = 2147483647;
+			dstPath = "";
+			dstSubfolderSpec = 10;
+			files = (
+				FCD3873921E1C31F0052F3D0 /* paddle_mobile.framework in Embed Frameworks */,
+			);
+			name = "Embed Frameworks";
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXCopyFilesBuildPhase section */
+/* Begin PBXFileReference section */
+		4FE67FF667A24FCB0134F627 /* Pods-MobileNetDemo.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-MobileNetDemo.debug.xcconfig"; path = "../Pods/Target Support Files/Pods-MobileNetDemo/Pods-MobileNetDemo.debug.xcconfig"; sourceTree = "<group>"; };
+		DD3A2E3175627EF63DACA36C /* Pods_MobileNetDemo.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_MobileNetDemo.framework; sourceTree = BUILT_PRODUCTS_DIR; };
+		E57059FE3629E3A8DE6C7ECF /* Pods-MobileNetDemo.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-MobileNetDemo.release.xcconfig"; path = "../Pods/Target Support Files/Pods-MobileNetDemo/Pods-MobileNetDemo.release.xcconfig"; sourceTree = "<group>"; };
+		FC74BB3521DFAFEC0055232B /* MobileNet.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MobileNet.swift; sourceTree = "<group>"; };
+		FCB40DA121E0B7C60075EC91 /* MobilenetPreProcess.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = MobilenetPreProcess.metal; sourceTree = "<group>"; };
+		FCB40DA321E0B85B0075EC91 /* MetalHelper.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; name = MetalHelper.swift; path = "../../paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift"; sourceTree = "<group>"; };
+		FCB40DD221E0B9410075EC91 /* banana.jpeg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = banana.jpeg; sourceTree = "<group>"; };
+		FCB40E4F21E0CEBB0075EC91 /* mobilenet_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_model; sourceTree = "<group>"; };
+		FCB40E5021E0CEBB0075EC91 /* mobilenet_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_params; sourceTree = "<group>"; };
+		FCB40E5321E0CEF80075EC91 /* synset.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = synset.txt; sourceTree = "<group>"; };
+		FCD3873721E1C31F0052F3D0 /* paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; path = paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; };
+		FCF2870521DFAEC7009A87DA /* MobileNetDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = MobileNetDemo.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		FCF2870821DFAEC7009A87DA /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = "<group>"; };
+		FCF2870A21DFAEC7009A87DA /* ViewController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ViewController.swift; sourceTree = "<group>"; };
+		FCF2870D21DFAEC7009A87DA /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = "<group>"; };
+		FCF2870F21DFAEC8009A87DA /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		FCF2871221DFAEC8009A87DA /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = "<group>"; };
+		FCF2871421DFAEC8009A87DA /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+/* Begin PBXFrameworksBuildPhase section */
+		FCF2870221DFAEC7009A87DA /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				FCD3873821E1C31F0052F3D0 /* paddle_mobile.framework in Frameworks */,
+				FA37E99B9AD29A07FEE8E743 /* Pods_MobileNetDemo.framework in Frameworks */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+/* Begin PBXGroup section */
+		0DDBA47E92A64BC7B0385B0F /* Frameworks */ = {
+			isa = PBXGroup;
+			children = (
+				DD3A2E3175627EF63DACA36C /* Pods_MobileNetDemo.framework */,
+			);
+			name = Frameworks;
+			sourceTree = "<group>";
+		};
+		1EACBAAF38D9EDE0AC2B3F90 /* Pods */ = {
+			isa = PBXGroup;
+			children = (
+				4FE67FF667A24FCB0134F627 /* Pods-MobileNetDemo.debug.xcconfig */,
+				E57059FE3629E3A8DE6C7ECF /* Pods-MobileNetDemo.release.xcconfig */,
+			);
+			name = Pods;
+			sourceTree = "<group>";
+		};
+		FCB40DCF21E0B9410075EC91 /* Resources */ = {
+			isa = PBXGroup;
+			children = (
+				FCB40DD021E0B9410075EC91 /* images */,
+				FCB40DD921E0B9410075EC91 /* models */,
+			);
+			path = Resources;
+			sourceTree = "<group>";
+		};
+		FCB40DD021E0B9410075EC91 /* images */ = {
+			isa = PBXGroup;
+			children = (
+				FCB40DD221E0B9410075EC91 /* banana.jpeg */,
+			);
+			path = images;
+			sourceTree = "<group>";
+		};
+		FCB40DD921E0B9410075EC91 /* models */ = {
+			isa = PBXGroup;
+			children = (
+				FCB40E4E21E0CEBB0075EC91 /* mobilenet_combine */,
+			);
+			path = models;
+			sourceTree = "<group>";
+		};
+		FCB40E4E21E0CEBB0075EC91 /* mobilenet_combine */ = {
+			isa = PBXGroup;
+			children = (
+				FCB40E5321E0CEF80075EC91 /* synset.txt */,
+				FCB40E4F21E0CEBB0075EC91 /* mobilenet_model */,
+				FCB40E5021E0CEBB0075EC91 /* mobilenet_params */,
+			);
+			path = mobilenet_combine;
+			sourceTree = "<group>";
+		};
+		FCF286FC21DFAEC7009A87DA = {
+			isa = PBXGroup;
+			children = (
+				FCD3873721E1C31F0052F3D0 /* paddle_mobile.framework */,
+				FCF2870721DFAEC7009A87DA /* MobileNetDemo */,
+				FCF2870621DFAEC7009A87DA /* Products */,
+				1EACBAAF38D9EDE0AC2B3F90 /* Pods */,
+				0DDBA47E92A64BC7B0385B0F /* Frameworks */,
+			);
+			sourceTree = "<group>";
+		};
+		FCF2870621DFAEC7009A87DA /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				FCF2870521DFAEC7009A87DA /* MobileNetDemo.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		FCF2870721DFAEC7009A87DA /* MobileNetDemo */ = {
+			isa = PBXGroup;
+			children = (
+				FCB40DCF21E0B9410075EC91 /* Resources */,
+				FCB40DA321E0B85B0075EC91 /* MetalHelper.swift */,
+				FC74BB3521DFAFEC0055232B /* MobileNet.swift */,
+				FCF2870821DFAEC7009A87DA /* AppDelegate.swift */,
+				FCF2870A21DFAEC7009A87DA /* ViewController.swift */,
+				FCF2870C21DFAEC7009A87DA /* Main.storyboard */,
+				FCF2870F21DFAEC8009A87DA /* Assets.xcassets */,
+				FCF2871121DFAEC8009A87DA /* LaunchScreen.storyboard */,
+				FCF2871421DFAEC8009A87DA /* Info.plist */,
+				FCB40DA121E0B7C60075EC91 /* MobilenetPreProcess.metal */,
+			);
+			path = MobileNetDemo;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+/* Begin PBXNativeTarget section */
+		FCF2870421DFAEC7009A87DA /* MobileNetDemo */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = FCF2871721DFAEC8009A87DA /* Build configuration list for PBXNativeTarget "MobileNetDemo" */;
+			buildPhases = (
+				B4EB56AEEFF6F3965DA3D2DA /* [CP] Check Pods Manifest.lock */,
+				FCF2870121DFAEC7009A87DA /* Sources */,
+				FCF2870221DFAEC7009A87DA /* Frameworks */,
+				FCF2870321DFAEC7009A87DA /* Resources */,
+				1D801B9681ACFCA70D444D2C /* [CP] Embed Pods Frameworks */,
+				FCB40DFC21E0BC360075EC91 /* Embed Frameworks */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = MobileNetDemo;
+			productName = MobileNetDemo;
+			productReference = FCF2870521DFAEC7009A87DA /* MobileNetDemo.app */;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+/* Begin PBXProject section */
+		FCF286FD21DFAEC7009A87DA /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				LastSwiftUpdateCheck = 1010;
+				LastUpgradeCheck = 1010;
+				ORGANIZATIONNAME = Ray;
+				TargetAttributes = {
+					FCF2870421DFAEC7009A87DA = {
+						CreatedOnToolsVersion = 10.1;
+					};
+				};
+			};
+			buildConfigurationList = FCF2870021DFAEC7009A87DA /* Build configuration list for PBXProject "MobileNetDemo" */;
+			compatibilityVersion = "Xcode 9.3";
+			developmentRegion = en;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+				Base,
+			);
+			mainGroup = FCF286FC21DFAEC7009A87DA;
+			productRefGroup = FCF2870621DFAEC7009A87DA /* Products */;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				FCF2870421DFAEC7009A87DA /* MobileNetDemo */,
+			);
+		};
+/* End PBXProject section */
+/* Begin PBXResourcesBuildPhase section */
+		FCF2870321DFAEC7009A87DA /* Resources */ = {
+			isa = PBXResourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				FCF2871321DFAEC8009A87DA /* LaunchScreen.storyboard in Resources */,
+				FCB40E5121E0CEBB0075EC91 /* mobilenet_model in Resources */,
+				FCB40DE921E0B9410075EC91 /* banana.jpeg in Resources */,
+				FCF2871021DFAEC8009A87DA /* Assets.xcassets in Resources */,
+				FCB40E5421E0CEF80075EC91 /* synset.txt in Resources */,
+				FCB40E5221E0CEBB0075EC91 /* mobilenet_params in Resources */,
+				FCF2870E21DFAEC7009A87DA /* Main.storyboard in Resources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXResourcesBuildPhase section */
+/* Begin PBXShellScriptBuildPhase section */
+		1D801B9681ACFCA70D444D2C /* [CP] Embed Pods Frameworks */ = {
+			isa = PBXShellScriptBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			inputFileListPaths = (
+			);
+			inputPaths = (
+				"${SRCROOT}/../Pods/Target Support Files/Pods-MobileNetDemo/Pods-MobileNetDemo-frameworks.sh",
+				"${BUILT_PRODUCTS_DIR}/SwiftProtobuf/SwiftProtobuf.framework",
+			);
+			name = "[CP] Embed Pods Frameworks";
+			outputFileListPaths = (
+			);
+			outputPaths = (
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/SwiftProtobuf.framework",
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+			shellPath = /bin/sh;
+			shellScript = "\"${SRCROOT}/../Pods/Target Support Files/Pods-MobileNetDemo/Pods-MobileNetDemo-frameworks.sh\"\n";
+			showEnvVarsInLog = 0;
+		};
+		B4EB56AEEFF6F3965DA3D2DA /* [CP] Check Pods Manifest.lock */ = {
+			isa = PBXShellScriptBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			inputFileListPaths = (
+			);
+			inputPaths = (
+				"${PODS_PODFILE_DIR_PATH}/Podfile.lock",
+				"${PODS_ROOT}/Manifest.lock",
+			);
+			name = "[CP] Check Pods Manifest.lock";
+			outputFileListPaths = (
+			);
+			outputPaths = (
+				"$(DERIVED_FILE_DIR)/Pods-MobileNetDemo-checkManifestLockResult.txt",
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+			shellPath = /bin/sh;
+			shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n    # print error to STDERR\n    echo \"error: The sandbox is not in sync with the Podfile.lock. Run 'pod install' or update your CocoaPods installation.\" >&2\n    exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n";
+			showEnvVarsInLog = 0;
+		};
+/* End PBXShellScriptBuildPhase section */
+/* Begin PBXSourcesBuildPhase section */
+		FCF2870121DFAEC7009A87DA /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				FC74BB3621DFAFEC0055232B /* MobileNet.swift in Sources */,
+				FCB40DA421E0B85B0075EC91 /* MetalHelper.swift in Sources */,
+				FCB40DA221E0B7C60075EC91 /* MobilenetPreProcess.metal in Sources */,
+				FCF2870B21DFAEC7009A87DA /* ViewController.swift in Sources */,
+				FCF2870921DFAEC7009A87DA /* AppDelegate.swift in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+/* Begin PBXVariantGroup section */
+		FCF2870C21DFAEC7009A87DA /* Main.storyboard */ = {
+			isa = PBXVariantGroup;
+			children = (
+				FCF2870D21DFAEC7009A87DA /* Base */,
+			);
+			name = Main.storyboard;
+			sourceTree = "<group>";
+		};
+		FCF2871121DFAEC8009A87DA /* LaunchScreen.storyboard */ = {
+			isa = PBXVariantGroup;
+			children = (
+				FCF2871221DFAEC8009A87DA /* Base */,
+			);
+			name = LaunchScreen.storyboard;
+			sourceTree = "<group>";
+		};
+/* End PBXVariantGroup section */
+/* Begin XCBuildConfiguration section */
+		FCF2871521DFAEC8009A87DA /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_COMMA = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				CODE_SIGN_IDENTITY = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = dwarf;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_TESTABILITY = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu11;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"DEBUG=1",
+					"$(inherited)",
+				);
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 12.1;
+				MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
+				MTL_FAST_MATH = YES;
+				ONLY_ACTIVE_ARCH = YES;
+				SDKROOT = iphoneos;
+				SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG;
+				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+			};
+			name = Debug;
+		};
+		FCF2871621DFAEC8009A87DA /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_COMMA = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				CODE_SIGN_IDENTITY = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				ENABLE_NS_ASSERTIONS = NO;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu11;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 12.1;
+				MTL_ENABLE_DEBUG_INFO = NO;
+				MTL_FAST_MATH = YES;
+				SDKROOT = iphoneos;
+				SWIFT_COMPILATION_MODE = wholemodule;
+				SWIFT_OPTIMIZATION_LEVEL = "-O";
+				VALIDATE_PRODUCT = YES;
+			};
+			name = Release;
+		};
+		FCF2871821DFAEC8009A87DA /* Debug */ = {
+			isa = XCBuildConfiguration;
+			baseConfigurationReference = 4FE67FF667A24FCB0134F627 /* Pods-MobileNetDemo.debug.xcconfig */;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CODE_SIGN_STYLE = Automatic;
+				DEVELOPMENT_TEAM = A798K58VVL;
+				INFOPLIST_FILE = MobileNetDemo/Info.plist;
+				IPHONEOS_DEPLOYMENT_TARGET = 9.0;
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+				);
+				PRODUCT_BUNDLE_IDENTIFIER = Ray.MobileNetDemo;
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				SWIFT_VERSION = 4.0;
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Debug;
+		};
+		FCF2871921DFAEC8009A87DA /* Release */ = {
+			isa = XCBuildConfiguration;
+			baseConfigurationReference = E57059FE3629E3A8DE6C7ECF /* Pods-MobileNetDemo.release.xcconfig */;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CODE_SIGN_STYLE = Automatic;
+				DEVELOPMENT_TEAM = A798K58VVL;
+				INFOPLIST_FILE = MobileNetDemo/Info.plist;
+				IPHONEOS_DEPLOYMENT_TARGET = 9.0;
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+				);
+				PRODUCT_BUNDLE_IDENTIFIER = Ray.MobileNetDemo;
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				SWIFT_VERSION = 4.0;
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+/* Begin XCConfigurationList section */
+		FCF2870021DFAEC7009A87DA /* Build configuration list for PBXProject "MobileNetDemo" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				FCF2871521DFAEC8009A87DA /* Debug */,
+				FCF2871621DFAEC8009A87DA /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		FCF2871721DFAEC8009A87DA /* Build configuration list for PBXNativeTarget "MobileNetDemo" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				FCF2871821DFAEC8009A87DA /* Debug */,
+				FCF2871921DFAEC8009A87DA /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = FCF286FD21DFAEC7009A87DA /* Project object */;
+}
--- a/metal/MobileNetDemo/MobileNetDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata
+++ b/metal/MobileNetDemo/MobileNetDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata
+<?xml version="1.0" encoding="UTF-8"?>
+<Workspace
+   version = "1.0">
+   <FileRef
+      location = "self:MobileNetDemo.xcodeproj">
+   </FileRef>
+</Workspace>
--- a/metal/MobileNetDemo/MobileNetDemo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
+++ b/metal/MobileNetDemo/MobileNetDemo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>IDEDidComputeMac32BitWarning</key>
+	<true/>
+</dict>
+</plist>
--- a/metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift
+++ b/metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift
+//
+//  AppDelegate.swift
+//  MobileNetDemo
+//
+//  Created by liuRuiLong on 2019/1/4.
+//  Copyright © 2019 Ray. All rights reserved.
+//
+import UIKit
+/// Application delegate for MobileNetDemo.
+///
+/// All lifecycle hooks are still empty template stubs; the only observable
+/// behaviour is that launching always reports success.
+@UIApplicationMain
+class AppDelegate: UIResponder, UIApplicationDelegate {
+  /// Window reference exposed by the delegate; nothing in this class assigns it.
+  var window: UIWindow?
+  /// Customization point after launch. Performs no setup and always returns `true`.
+  func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]?) -> Bool {
+    return true
+  }
+  /// Hook for restarting tasks that were paused (or never started) while the app
+  /// was inactive, and for refreshing the UI after a return from the background.
+  /// Currently a no-op.
+  func applicationDidBecomeActive(_ application: UIApplication) {
+  }
+  /// Hook for the active -> inactive transition (temporary interruptions such as
+  /// an incoming call or SMS, or the user leaving the app). Intended for pausing
+  /// ongoing tasks, disabling timers and invalidating rendering callbacks.
+  /// Currently a no-op.
+  func applicationWillResignActive(_ application: UIApplication) {
+  }
+  /// Hook for releasing shared resources, saving user data, invalidating timers
+  /// and storing enough state to restore the app if it is terminated later.
+  /// When background execution is supported this is called instead of
+  /// `applicationWillTerminate(_:)` when the user quits. Currently a no-op.
+  func applicationDidEnterBackground(_ application: UIApplication) {
+  }
+  /// Hook for the background -> active transition, where changes made on
+  /// entering the background can be undone. Currently a no-op.
+  func applicationWillEnterForeground(_ application: UIApplication) {
+  }
+  /// Hook invoked when the app is about to terminate; the place to save data if
+  /// appropriate (see also `applicationDidEnterBackground(_:)`). Currently a no-op.
+  func applicationWillTerminate(_ application: UIApplication) {
+  }
+}
--- a/metal/MobileNetDemo/MobileNetDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
+++ b/metal/MobileNetDemo/MobileNetDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
+{
+  "images" : [
+    {
+      "idiom" : "iphone",
+      "size" : "20x20",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "20x20",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "29x29",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "29x29",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "40x40",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "40x40",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "60x60",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "60x60",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "20x20",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "20x20",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "29x29",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "29x29",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "40x40",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "40x40",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "76x76",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "76x76",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "83.5x83.5",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ios-marketing",
+      "size" : "1024x1024",
+      "scale" : "1x"
+    }
+  ],
+  "info" : {
+    "version" : 1,
+    "author" : "xcode"
+  }
+}
\ No newline at end of file
--- a/metal/MobileNetDemo/MobileNetDemo/Assets.xcassets/Contents.json
+++ b/metal/MobileNetDemo/MobileNetDemo/Assets.xcassets/Contents.json
+{
+  "info" : {
+    "version" : 1,
+    "author" : "xcode"
+  }
+}
\ No newline at end of file
--- a/metal/MobileNetDemo/MobileNetDemo/Base.lproj/LaunchScreen.storyboard
+++ b/metal/MobileNetDemo/MobileNetDemo/Base.lproj/LaunchScreen.storyboard
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="13122.16" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" launchScreen="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="01J-lp-oVM">
+    <dependencies>
+        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="13104.12"/>
+        <capability name="Safe area layout guides" minToolsVersion="9.0"/>
+        <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
+    </dependencies>
+    <scenes>
+        <!--View Controller-->
+        <scene sceneID="EHf-IW-A2E">
+            <objects>
+                <viewController id="01J-lp-oVM" sceneMemberID="viewController">
+                    <view key="view" contentMode="scaleToFill" id="Ze5-6b-2t3">
+                        <rect key="frame" x="0.0" y="0.0" width="375" height="667"/>
+                        <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
+                        <color key="backgroundColor" red="1" green="1" blue="1" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
+                        <viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/>
+                    </view>
+                </viewController>
+                <placeholder placeholderIdentifier="IBFirstResponder" id="iYj-Kq-Ea1" userLabel="First Responder" sceneMemberID="firstResponder"/>
+            </objects>
+            <point key="canvasLocation" x="53" y="375"/>
+        </scene>
+    </scenes>
+</document>
--- a/metal/MobileNetDemo/MobileNetDemo/Base.lproj/Main.storyboard
+++ b/metal/MobileNetDemo/MobileNetDemo/Base.lproj/Main.storyboard
+<?xml version="1.0" encoding="UTF-8"?>
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="14460.31" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="hKf-0C-qAk">
+    <device id="retina4_7" orientation="portrait">
+        <adaptation id="fullscreen"/>
+    </device>
+    <dependencies>
+        <deployment identifier="iOS"/>
+        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="14460.20"/>
+        <capability name="Safe area layout guides" minToolsVersion="9.0"/>
+        <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
+    </dependencies>
+    <scenes>
+        <!--View Controller-->
+        <scene sceneID="Me8-c9-Oox">
+            <objects>
+                <viewController id="hKf-0C-qAk" customClass="ViewController" customModule="MobileNetDemo" customModuleProvider="target" sceneMemberID="viewController">
+                    <view key="view" contentMode="scaleToFill" id="Yst-rK-Wk7">
+                        <rect key="frame" x="0.0" y="0.0" width="375" height="667"/>
+                        <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
+                        <subviews>
+                            <imageView userInteractionEnabled="NO" contentMode="scaleAspectFit" horizontalHuggingPriority="251" verticalHuggingPriority="251" translatesAutoresizingMaskIntoConstraints="NO" id="bDP-xQ-JgS">
+                                <rect key="frame" x="0.0" y="20" width="375" height="271"/>
+                            </imageView>
+                            <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" showsTouchWhenHighlighted="YES" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="HLo-2k-dr7">
+                                <rect key="frame" x="16" y="597" width="63.5" height="30"/>
+                                <color key="backgroundColor" white="0.0" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
+                                <constraints>
+                                    <constraint firstAttribute="width" secondItem="HLo-2k-dr7" secondAttribute="height" multiplier="21:10" id="xlA-qq-ubI"/>
+                                </constraints>
+                                <state key="normal" title="Image">
+                                    <color key="titleColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
+                                </state>
+                                <connections>
+                                    <action selector="selectImageAct:" destination="hKf-0C-qAk" eventType="touchUpInside" id="rJB-ZK-jTR"/>
+                                </connections>
+                            </button>
+                            <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" showsTouchWhenHighlighted="YES" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="Aa7-KR-JhB">
+                                <rect key="frame" x="109.5" y="597" width="63" height="30"/>
+                                <color key="backgroundColor" white="0.0" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
+                                <state key="normal" title="Load">
+                                    <color key="titleColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
+                                </state>
+                                <connections>
+                                    <action selector="loadAct:" destination="hKf-0C-qAk" eventType="touchUpInside" id="Lkj-aW-8vj"/>
+                                </connections>
+                            </button>
+                            <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" showsTouchWhenHighlighted="YES" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="2dy-Ya-PJY">
+                                <rect key="frame" x="202.5" y="597" width="63.5" height="30"/>
+                                <color key="backgroundColor" white="0.0" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
+                                <state key="normal" title="Predict">
+                                    <color key="titleColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
+                                </state>
+                                <connections>
+                                    <action selector="predictAct:" destination="hKf-0C-qAk" eventType="touchUpInside" id="iw4-E7-3br"/>
+                                </connections>
+                            </button>
+                            <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" showsTouchWhenHighlighted="YES" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="Bac-eY-xPP">
+                                <rect key="frame" x="296" y="597" width="63" height="30"/>
+                                <color key="backgroundColor" white="0.0" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
+                                <state key="normal" title="Clear">
+                                    <color key="titleColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
+                                </state>
+                                <connections>
+                                    <action selector="clearAct:" destination="hKf-0C-qAk" eventType="touchUpInside" id="QgH-jd-cR1"/>
+                                </connections>
+                            </button>
+                            <view contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="vhI-WH-WKF">
+                                <rect key="frame" x="79.5" y="597" width="30" height="30"/>
+                                <color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
+                                <constraints>
+                                    <constraint firstAttribute="width" constant="30" id="ffB-31-3Iy"/>
+                                    <constraint firstAttribute="height" constant="30" id="nbx-3B-EW0"/>
+                                </constraints>
+                            </view>
+                            <view contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="ZoT-1q-tgf">
+                                <rect key="frame" x="266" y="597" width="30" height="30"/>
+                                <color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
+                                <constraints>
+                                    <constraint firstAttribute="height" constant="30" id="Iu3-ig-lYv"/>
+                                    <constraint firstAttribute="width" constant="30" id="Jic-6I-7ch"/>
+                                </constraints>
+                            </view>
+                            <view contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="Zvo-dq-f6D">
+                                <rect key="frame" x="172.5" y="597" width="30" height="30"/>
+                                <color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
+                                <constraints>
+                                    <constraint firstAttribute="width" constant="30" id="Zgu-c6-rPT"/>
+                                    <constraint firstAttribute="height" constant="30" id="c8V-Gd-hiK"/>
+                                </constraints>
+                            </view>
+                            <label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" text="耗时:" lineBreakMode="tailTruncation" numberOfLines="0" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="Jox-rT-ieC">
+                                <rect key="frame" x="15" y="301" width="350" height="38"/>
+                                <constraints>
+                                    <constraint firstAttribute="height" constant="38" id="8TB-w5-hbk"/>
+                                </constraints>
+                                <fontDescription key="fontDescription" type="system" pointSize="15"/>
+                                <nil key="textColor"/>
+                                <nil key="highlightedColor"/>
+                            </label>
+                            <imageView userInteractionEnabled="NO" contentMode="scaleToFill" horizontalHuggingPriority="251" verticalHuggingPriority="251" image="paddle-mobile.png" translatesAutoresizingMaskIntoConstraints="NO" id="PZO-kk-MVS">
+                                <rect key="frame" x="90" y="637" width="195" height="30"/>
+                                <constraints>
+                                    <constraint firstAttribute="width" secondItem="PZO-kk-MVS" secondAttribute="height" multiplier="6.5:1" id="9DJ-Rj-4ex"/>
+                                </constraints>
+                            </imageView>
+                            <textView clipsSubviews="YES" multipleTouchEnabled="YES" contentMode="scaleToFill" editable="NO" text="结果:" textAlignment="natural" translatesAutoresizingMaskIntoConstraints="NO" id="efW-gP-E3g">
+                                <rect key="frame" x="10" y="347" width="355" height="150"/>
+                                <color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
+                                <constraints>
+                                    <constraint firstAttribute="height" constant="150" id="whC-NW-nhZ"/>
+                                </constraints>
+                                <fontDescription key="fontDescription" type="system" pointSize="15"/>
+                                <textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
+                            </textView>
+                        </subviews>
+                        <color key="backgroundColor" red="1" green="1" blue="1" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
+                        <constraints>
+                            <constraint firstItem="PZO-kk-MVS" firstAttribute="centerX" secondItem="Yst-rK-Wk7" secondAttribute="centerX" id="2ET-tq-zfh"/>
+                            <constraint firstItem="Zvo-dq-f6D" firstAttribute="leading" secondItem="Aa7-KR-JhB" secondAttribute="trailing" id="368-Sl-KgC"/>
+                            <constraint firstItem="bDP-xQ-JgS" firstAttribute="top" secondItem="hlK-mk-uEU" secondAttribute="top" id="3HC-Tb-qff"/>
+                            <constraint firstItem="hlK-mk-uEU" firstAttribute="trailing" secondItem="efW-gP-E3g" secondAttribute="trailing" constant="10" id="8QW-BB-dry"/>
+                            <constraint firstItem="ZoT-1q-tgf" firstAttribute="leading" secondItem="2dy-Ya-PJY" secondAttribute="trailing" id="AhB-vB-1aW"/>
+                            <constraint firstItem="Bac-eY-xPP" firstAttribute="leading" secondItem="ZoT-1q-tgf" secondAttribute="trailing" id="BhE-d9-7Sf"/>
+                            <constraint firstItem="HLo-2k-dr7" firstAttribute="leading" secondItem="hlK-mk-uEU" secondAttribute="leading" constant="16" id="BuX-zw-HOG"/>
+                            <constraint firstItem="HLo-2k-dr7" firstAttribute="width" secondItem="Aa7-KR-JhB" secondAttribute="width" id="Dbs-xF-8in"/>
+                            <constraint firstItem="HLo-2k-dr7" firstAttribute="width" secondItem="Bac-eY-xPP" secondAttribute="width" id="Dov-mA-K38"/>
+                            <constraint firstItem="hlK-mk-uEU" firstAttribute="trailing" secondItem="Jox-rT-ieC" secondAttribute="trailing" constant="10" id="LfU-MA-UTb"/>
+                            <constraint firstItem="Aa7-KR-JhB" firstAttribute="centerY" secondItem="HLo-2k-dr7" secondAttribute="centerY" id="OMl-f7-5CL"/>
+                            <constraint firstItem="HLo-2k-dr7" firstAttribute="top" secondItem="efW-gP-E3g" secondAttribute="bottom" constant="100" id="P2f-lC-F02"/>
+                            <constraint firstItem="hlK-mk-uEU" firstAttribute="bottom" secondItem="HLo-2k-dr7" secondAttribute="bottom" constant="40" id="Po9-43-AFd"/>
+                            <constraint firstItem="bDP-xQ-JgS" firstAttribute="trailing" secondItem="hlK-mk-uEU" secondAttribute="trailing" id="Pqb-0o-qjh"/>
+                            <constraint firstItem="hlK-mk-uEU" firstAttribute="trailing" secondItem="Bac-eY-xPP" secondAttribute="trailing" constant="16" id="VOE-fl-N71"/>
+                            <constraint firstItem="vhI-WH-WKF" firstAttribute="leading" secondItem="HLo-2k-dr7" secondAttribute="trailing" id="Vlg-FW-uEQ"/>
+                            <constraint firstItem="ZoT-1q-tgf" firstAttribute="centerY" secondItem="HLo-2k-dr7" secondAttribute="centerY" id="Wpv-Ck-8l3"/>
+                            <constraint firstItem="efW-gP-E3g" firstAttribute="top" secondItem="Jox-rT-ieC" secondAttribute="bottom" constant="8" id="Z8f-Rs-QDZ"/>
+                            <constraint firstItem="vhI-WH-WKF" firstAttribute="centerY" secondItem="HLo-2k-dr7" secondAttribute="centerY" id="af8-Qd-iQN"/>
+                            <constraint firstItem="bDP-xQ-JgS" firstAttribute="leading" secondItem="hlK-mk-uEU" secondAttribute="leading" id="bZr-fF-a2S"/>
+                            <constraint firstItem="HLo-2k-dr7" firstAttribute="width" secondItem="2dy-Ya-PJY" secondAttribute="width" id="c0U-4X-uIO"/>
+                            <constraint firstItem="2dy-Ya-PJY" firstAttribute="leading" secondItem="Zvo-dq-f6D" secondAttribute="trailing" id="cRa-pW-xi8"/>
+                            <constraint firstItem="2dy-Ya-PJY" firstAttribute="centerY" secondItem="HLo-2k-dr7" secondAttribute="centerY" id="f2B-zG-wXC"/>
+                            <constraint firstItem="PZO-kk-MVS" firstAttribute="top" secondItem="HLo-2k-dr7" secondAttribute="bottom" constant="10" id="hAy-La-Eeh"/>
+                            <constraint firstItem="Zvo-dq-f6D" firstAttribute="centerY" secondItem="HLo-2k-dr7" secondAttribute="centerY" id="hUc-wn-Ua1"/>
+                            <constraint firstItem="Bac-eY-xPP" firstAttribute="centerY" secondItem="HLo-2k-dr7" secondAttribute="centerY" id="jDC-ag-kL6"/>
+                            <constraint firstItem="Aa7-KR-JhB" firstAttribute="leading" secondItem="vhI-WH-WKF" secondAttribute="trailing" id="jgU-OM-v1G"/>
+                            <constraint firstItem="PZO-kk-MVS" firstAttribute="bottom" secondItem="hlK-mk-uEU" secondAttribute="bottom" id="lkS-rk-Ap8"/>
+                            <constraint firstItem="efW-gP-E3g" firstAttribute="leading" secondItem="hlK-mk-uEU" secondAttribute="leading" constant="10" id="q2g-4E-mgJ"/>
+                            <constraint firstItem="Jox-rT-ieC" firstAttribute="top" secondItem="bDP-xQ-JgS" secondAttribute="bottom" constant="10" id="rqK-Pv-SXt"/>
+                            <constraint firstItem="Jox-rT-ieC" firstAttribute="leading" secondItem="hlK-mk-uEU" secondAttribute="leading" constant="15" id="sP3-ym-vhH"/>
+                        </constraints>
+                        <viewLayoutGuide key="safeArea" id="hlK-mk-uEU"/>
+                    </view>
+                    <connections>
+                        <outlet property="elapsedTimeLabel" destination="Jox-rT-ieC" id="QdK-sY-xmq"/>
+                        <outlet property="resultTextView" destination="efW-gP-E3g" id="Vnl-XG-D8E"/>
+                        <outlet property="selectImageView" destination="bDP-xQ-JgS" id="dMV-Wh-YsW"/>
+                    </connections>
+                </viewController>
+                <placeholder placeholderIdentifier="IBFirstResponder" id="ShQ-yg-7s0" sceneMemberID="firstResponder"/>
+            </objects>
+            <point key="canvasLocation" x="-1558" y="-14"/>
+        </scene>
+    </scenes>
+    <resources>
+        <image name="paddle-mobile.png" width="16" height="16"/>
+    </resources>
+</document>
--- a/metal/MobileNetDemo/MobileNetDemo/Info.plist
+++ b/metal/MobileNetDemo/MobileNetDemo/Info.plist
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleDevelopmentRegion</key>
+	<string>$(DEVELOPMENT_LANGUAGE)</string>
+	<key>CFBundleExecutable</key>
+	<string>$(EXECUTABLE_NAME)</string>
+	<key>CFBundleIdentifier</key>
+	<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
+	<key>CFBundleInfoDictionaryVersion</key>
+	<string>6.0</string>
+	<key>CFBundleName</key>
+	<string>$(PRODUCT_NAME)</string>
+	<key>CFBundlePackageType</key>
+	<string>APPL</string>
+	<key>CFBundleShortVersionString</key>
+	<string>1.0</string>
+	<key>CFBundleVersion</key>
+	<string>1</string>
+	<key>LSRequiresIPhoneOS</key>
+	<true/>
+	<key>NSCameraUsageDescription</key>
+	<string>use camera</string>
+	<key>UILaunchStoryboardName</key>
+	<string>LaunchScreen</string>
+	<key>UIMainStoryboardFile</key>
+	<string>Main</string>
+	<key>UIRequiredDeviceCapabilities</key>
+	<array>
+		<string>armv7</string>
+	</array>
+	<key>UISupportedInterfaceOrientations</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationLandscapeLeft</string>
+		<string>UIInterfaceOrientationLandscapeRight</string>
+	</array>
+	<key>UISupportedInterfaceOrientations~ipad</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationPortraitUpsideDown</string>
+		<string>UIInterfaceOrientationLandscapeLeft</string>
+		<string>UIInterfaceOrientationLandscapeRight</string>
+	</array>
+</dict>
+</plist>
--- a/metal/paddle-mobile/paddle-mobile/MobileNet.swift
+++ b/metal/paddle-mobile/paddle-mobile/MobileNet.swift
@@ -13,13 +13,13 @@
 limitations under the License. */
 import Foundation
+import paddle_mobile
-class MobileNet: Net{
+public class MobileNet: Net{
  class MobilenetPreProccess: CusomKernel {
    init(device: MTLDevice) {
-      let s = CusomKernel.Shape.init(inWidth: 224, inHeight: 224, inChannel: 3)
+      let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3)
-      super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, usePaddleMobileLib: false)
+      super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
    }
  }
@@ -43,9 +43,7 @@ class MobileNet: Net{
  let labels = PreWords.init(fileName: "synset")
  override public func resultStr(res: ResultHolder) -> String {
-    guard let resPointer = res.result else {
+    let resPointer = res.result
-      fatalError()
-    }
    var s: [String] = []
    (0..<res.capacity).map { resPointer[$0] }.top(r: 5).enumerated().forEach{
      s.append(String(format: "%d: %@ (%3.2f%%)", $0 + 1, labels[$1.0], $1.1 * 100))
@@ -53,18 +51,13 @@ class MobileNet: Net{
    return s.joined(separator: "\n")
  }
+  override public init(device: MTLDevice) {
-  override init(device: MTLDevice) {
    super.init(device: device)
-    means = [123.68, 116.78, 103.94]
-    scale = 0.017
    except = 0
-    modelPath = Bundle.main.path(forResource: "model", ofType: nil) ?! "model null"
+    modelPath = Bundle.main.path(forResource: "mobilenet_model", ofType: nil) ?! "model null"
-    paramPath = Bundle.main.path(forResource: "params", ofType: nil) ?! "para null"
+    paramPath = Bundle.main.path(forResource: "mobilenet_params", ofType: nil) ?! "para null"
-    modelDir = ""
    preprocessKernel = MobilenetPreProccess.init(device: device)
-    dim = (n: 1, h: 224, w: 224, c: 3)
+    inputDim = Dim.init(inDim: [1, 224, 224, 3])
  }
 }
--- a/metal/MobileNetDemo/MobileNetDemo/MobilenetPreProcess.metal
+++ b/metal/MobileNetDemo/MobileNetDemo/MobilenetPreProcess.metal
+//
+//  MobilenetPreProcess.metal
+//  MobileNetDemo
+//
+//  Created by liuRuiLong on 2019/1/5.
+//  Copyright © 2019 Ray. All rights reserved.
+//
+#include <metal_stdlib>
+using namespace metal;
+// Float-precision input preprocessing for MobileNet.
+// For each output texel: read the input texel at the same grid position,
+// multiply by 255, subtract the per-channel means, multiply by 0.017, and
+// write the result with the x and z channels swapped and w forced to 0.
+// NOTE(review): the *255 step presumably maps a normalized [0, 1] read back
+// to the 0-255 range — confirm against the input texture's pixel format.
+kernel void mobilenet_preprocess(
+                                 texture2d<float, access::read> inTexture [[texture(0)]],
+                                 texture2d<float, access::write> outTexture [[texture(1)]],
+                                 uint2 gid [[thread_position_in_grid]])
+{
+  // Threads that fall outside the output texture have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height();
+  if (!insideOutput) {
+    return;
+  }
+  const auto channelMeans = float4(123.68f, 116.78f, 103.94f, 0.0f);
+  const float4 texel = inTexture.read(gid);
+  const float4 normalized = (texel * 255.0 - channelMeans) * 0.017;
+  // Emit channels in z, y, x order; the fourth component is always zero.
+  outTexture.write(float4(normalized.z, normalized.y, normalized.x, 0.0f), gid);
+}
+// Half-precision variant of the MobileNet input preprocessing.
+// Performs the same steps as the float kernel on half textures: read the
+// input texel, multiply by 255, subtract the per-channel means, multiply by
+// 0.017, then write with the x and z channels swapped and w forced to 0.
+kernel void mobilenet_preprocess_half(
+                                      texture2d<half, access::read> inTexture [[texture(0)]],
+                                      texture2d<half, access::write> outTexture [[texture(1)]],
+                                      uint2 gid [[thread_position_in_grid]])
+{
+  // Threads that fall outside the output texture have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height();
+  if (!insideOutput) {
+    return;
+  }
+  const auto channelMeans = half4(123.68f, 116.78f, 103.94f, 0.0f);
+  const half4 texel = inTexture.read(gid);
+  const half4 normalized = (texel * 255.0 - channelMeans) * 0.017;
+  // Emit channels in z, y, x order; the fourth component is always zero.
+  outTexture.write(half4(normalized.z, normalized.y, normalized.x, 0.0f), gid);
+}
--- a/metal/MobileNetDemo/MobileNetDemo/ViewController.swift
+++ b/metal/MobileNetDemo/MobileNetDemo/ViewController.swift
+//
+//  ViewController.swift
+//  MobileNetDemo
+//
+//  Created by liuRuiLong on 2019/1/4.
+//  Copyright © 2019 Ray. All rights reserved.
+//
+import UIKit
+import paddle_mobile
+/// Demo screen that runs MobileNet on a single image and shows the resulting
+/// labels together with the elapsed prediction time.
+///
+/// The storyboard wires four buttons to this controller: "Image"
+/// (`selectImageAct`), "Load" (`loadAct`), "Predict" (`predictAct`) and
+/// "Clear" (`clearAct`).
+class ViewController: UIViewController {
+  @IBOutlet weak var resultTextView: UITextView!
+  @IBOutlet weak var selectImageView: UIImageView!
+  @IBOutlet weak var elapsedTimeLabel: UILabel!
+  var net: MobileNet!
+  var runner: Runner!
+  /// Texture for the currently selected image; nil until `runner.getTexture` has delivered one.
+  var toPredictTexture: MTLTexture?
+  /// Builds the net and runner, then preloads the bundled sample image (if present).
+  override func viewDidLoad() {
+    super.viewDidLoad()
+    GlobalConfig.shared.computePrecision = .Float16
+    net = MobileNet.init(device: MetalHelper.shared.device)
+    runner = Runner.init(inNet: net, commandQueue: MetalHelper.shared.queue)
+    if let selectImage = UIImage.init(named: "banana.jpeg") {
+      selectImageView.image = selectImage
+      // `cgImage` is optional (nil for images without a CGImage backing), so do not force-unwrap it.
+      if let cgImage = selectImage.cgImage {
+        runner.getTexture(image: cgImage) {[weak self] (texture) in
+          self?.toPredictTexture = texture
+        }
+      } else {
+        print(" sample image has no cgImage ")
+      }
+    }
+  }
+  /// Loads the model through the runner and reports success in the result text view.
+  @IBAction func loadAct(_ sender: Any) {
+    if runner.load() {
+      let resultText = " load success ! "
+      print(resultText)
+      self.resultTextView.text = resultText
+    } else {
+      fatalError(" load error ")
+    }
+  }
+  /// Presents the camera so the user can capture the image to classify.
+  @IBAction func selectImageAct(_ sender: Any) {
+    // Selecting an unavailable source type makes UIImagePickerController raise
+    // (e.g. on the simulator), so bail out with a message instead of crashing.
+    guard UIImagePickerController.isSourceTypeAvailable(.camera) else {
+      let message = " camera is not available "
+      print(message)
+      self.resultTextView.text = message
+      return
+    }
+    let imagePicker = UIImagePickerController()
+    imagePicker.sourceType = .camera
+    imagePicker.delegate = self
+    self.present(imagePicker, animated: true, completion: nil)
+  }
+  /// Forwards to `runner.clear()`.
+  @IBAction func clearAct(_ sender: Any) {
+    runner.clear()
+  }
+  /// Runs a prediction on the current texture and displays the elapsed time and labels.
+  @IBAction func predictAct(_ sender: Any) {
+    guard let texture = toPredictTexture else {
+      print(" toPredictTexture is nil ")
+      return
+    }
+    let beginDate = Date.init()
+    runner.predict(texture: texture) { [weak self] (success, resultHolder) in
+      if success, let inResultHolder = resultHolder {
+        // Measure before hopping to the main queue so UI scheduling is not counted.
+        let timeUse = Date.init().timeIntervalSince(beginDate)
+        DispatchQueue.main.async {
+          self?.elapsedTimeLabel.text = "\(timeUse * 1000)ms"
+          self?.resultTextView.text = self?.net.resultStr(res: inResultHolder)
+        }
+      } else {
+        print(" predict fail ")
+      }
+    }
+  }
+}
+extension ViewController:  UIImagePickerControllerDelegate, UINavigationControllerDelegate {
+  /// Handles a finished pick: dismisses the picker, then shows the original
+  /// image and requests a texture for it from the runner.
+  func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) {
+    picker.dismiss(animated: true){[weak self] in
+      // The controller may already be deallocated when the dismissal completes;
+      // there is nothing left to update in that case, so do not crash.
+      guard let sSelf = self else {
+        return
+      }
+      // Require both the picked image and its CGImage before touching the UI, so the
+      // displayed image and the texture used for prediction never get out of sync.
+      guard let image = info["UIImagePickerControllerOriginalImage"] as? UIImage,
+        let cgImage = image.cgImage else {
+        print(" no image ")
+        return
+      }
+      sSelf.selectImageView.image = image
+      sSelf.runner.getTexture(image: cgImage, getTexture: { (texture) in
+        sSelf.toPredictTexture = texture
+      })
+    }
+  }
+}
--- a/metal/Podfile
+++ b/metal/Podfile
@@ -17,3 +17,9 @@ target 'paddle-mobile-unit-test' do
    project 'paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj'
    pod 'SwiftProtobuf', '~> 1.0'
 end
+target 'MobileNetDemo' do
+    project 'MobileNetDemo/MobileNetDemo.xcodeproj'
+    pod 'SwiftProtobuf', '~> 1.0'
+end
--- a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj
@@ -10,20 +10,42 @@
 		30D0ED21F392CFA3885B1002 /* Pods_paddle_mobile_demo.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 18896810981724F8A0FED62A /* Pods_paddle_mobile_demo.framework */; };
 		C2CBB49021B778EA0020DC6C /* libc++.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = FC4FD97B2140EE250073E130 /* libc++.tbd */; };
 		C2E67E5E21524E460013F575 /* LoadPointerViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = C2E67E5D21524E460013F575 /* LoadPointerViewController.m */; };
-		FC013928210204A3008100E3 /* PreProcessKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC013927210204A3008100E3 /* PreProcessKernel.metal */; };
 		FC039B8220E11C550081E9F8 /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B8120E11C550081E9F8 /* AppDelegate.swift */; };
 		FC039B8420E11C550081E9F8 /* ViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B8320E11C550081E9F8 /* ViewController.swift */; };
 		FC039B8720E11C550081E9F8 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC039B8520E11C550081E9F8 /* Main.storyboard */; };
 		FC039B8920E11C560081E9F8 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = FC039B8820E11C560081E9F8 /* Assets.xcassets */; };
 		FC039B8C20E11C560081E9F8 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC039B8A20E11C560081E9F8 /* LaunchScreen.storyboard */; };
 		FC203FB221CBFDBA00B37166 /* test.jpg in Resources */ = {isa = PBXBuildFile; fileRef = FC203FA921CBFDBA00B37166 /* test.jpg */; };
-		FC203FB321CBFDBA00B37166 /* combined_mobilenet_params in Resources */ = {isa = PBXBuildFile; fileRef = FC203FAD21CBFDBA00B37166 /* combined_mobilenet_params */; };
+		FC2BFCBC21DF0A8600C262B2 /* 00001.jpg in Resources */ = {isa = PBXBuildFile; fileRef = FC2BFCBB21DF0A8600C262B2 /* 00001.jpg */; };
-		FC203FB421CBFDBA00B37166 /* combined_mobilenet_model in Resources */ = {isa = PBXBuildFile; fileRef = FC203FAE21CBFDBA00B37166 /* combined_mobilenet_model */; };
+		FC2BFCBE21DF15D900C262B2 /* 123.jpg in Resources */ = {isa = PBXBuildFile; fileRef = FC2BFCBD21DF15D900C262B2 /* 123.jpg */; };
-		FC203FB521CBFDBA00B37166 /* yolo_params in Resources */ = {isa = PBXBuildFile; fileRef = FC203FB021CBFDBA00B37166 /* yolo_params */; };
+		FC2BFCC021DF279900C262B2 /* classify-img-output.png in Resources */ = {isa = PBXBuildFile; fileRef = FC2BFCBF21DF279900C262B2 /* classify-img-output.png */; };
-		FC203FB621CBFDBA00B37166 /* yolo_model in Resources */ = {isa = PBXBuildFile; fileRef = FC203FB121CBFDBA00B37166 /* yolo_model */; };
+		FC2BFD3021DF3FEA00C262B2 /* MobilenetSSD_AR.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD2A21DF3FE900C262B2 /* MobilenetSSD_AR.swift */; };
+		FC2BFD3121DF3FEA00C262B2 /* Genet.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD2B21DF3FE900C262B2 /* Genet.swift */; };
+		FC2BFD3221DF3FEA00C262B2 /* MobileNetSSD.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD2C21DF3FE900C262B2 /* MobileNetSSD.swift */; };
+		FC2BFD3321DF3FEA00C262B2 /* YoloNet.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD2D21DF3FE900C262B2 /* YoloNet.swift */; };
+		FC2BFD3421DF3FEA00C262B2 /* MobileNetCombined.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD2E21DF3FEA00C262B2 /* MobileNetCombined.swift */; };
+		FC2BFD3521DF3FEA00C262B2 /* MobileNet.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD2F21DF3FEA00C262B2 /* MobileNet.swift */; };
+		FC2BFD3821DF46DE00C262B2 /* OCDemoViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD3721DF46DE00C262B2 /* OCDemoViewController.m */; };
+		FC2BFD3C21DF480400C262B2 /* CPUCompute.mm in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD3B21DF480400C262B2 /* CPUCompute.mm */; };
+		FC2BFD3E21DF5CE800C262B2 /* PreProcessKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD3D21DF5CE800C262B2 /* PreProcessKernel.metal */; };
+		FC2BFD4321DF5E1E00C262B2 /* PaddleMobileGPU.m in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD4021DF5E1E00C262B2 /* PaddleMobileGPU.m */; };
+		FC2BFD4421DF5E1E00C262B2 /* SuperResolutionNet.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD4221DF5E1E00C262B2 /* SuperResolutionNet.swift */; };
+		FC5E03B221DCE8D90016C137 /* mingren_input_data in Resources */ = {isa = PBXBuildFile; fileRef = FC5E03B121DCE8D90016C137 /* mingren_input_data */; };
+		FC704C1921D2375300F98BAB /* super_params in Resources */ = {isa = PBXBuildFile; fileRef = FC704C1721D2375300F98BAB /* super_params */; };
+		FC704C1A21D2375300F98BAB /* super_model in Resources */ = {isa = PBXBuildFile; fileRef = FC704C1821D2375300F98BAB /* super_model */; };
+		FC704C2221D237FC00F98BAB /* combined_mobilenet_params in Resources */ = {isa = PBXBuildFile; fileRef = FC704C1D21D237FC00F98BAB /* combined_mobilenet_params */; };
+		FC704C2321D237FC00F98BAB /* combined_mobilenet_model in Resources */ = {isa = PBXBuildFile; fileRef = FC704C1E21D237FC00F98BAB /* combined_mobilenet_model */; };
+		FC704C2421D237FC00F98BAB /* yolo_params in Resources */ = {isa = PBXBuildFile; fileRef = FC704C2021D237FC00F98BAB /* yolo_params */; };
+		FC704C2521D237FC00F98BAB /* yolo_model in Resources */ = {isa = PBXBuildFile; fileRef = FC704C2121D237FC00F98BAB /* yolo_model */; };
 		FC803BCD214D27930094B8E5 /* FPSCounter.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BCB214D27920094B8E5 /* FPSCounter.swift */; };
 		FC803BCE214D27930094B8E5 /* VideoCapture.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BCC214D27920094B8E5 /* VideoCapture.swift */; };
+		FC9797BE21D6045B00F2FD90 /* banana.jpeg in Resources */ = {isa = PBXBuildFile; fileRef = FC9797BD21D6045B00F2FD90 /* banana.jpeg */; };
+		FC9797C221D608E000F2FD90 /* mobilenet_model in Resources */ = {isa = PBXBuildFile; fileRef = FC9797C021D608DF00F2FD90 /* mobilenet_model */; };
+		FC9797C321D608E000F2FD90 /* mobilenet_params in Resources */ = {isa = PBXBuildFile; fileRef = FC9797C121D608DF00F2FD90 /* mobilenet_params */; };
+		FC9797C721D609FB00F2FD90 /* synset.txt in Resources */ = {isa = PBXBuildFile; fileRef = FC9797C621D609FB00F2FD90 /* synset.txt */; };
+		FC9797CF21D6506F00F2FD90 /* mingren.jpg in Resources */ = {isa = PBXBuildFile; fileRef = FC9797CE21D6506F00F2FD90 /* mingren.jpg */; };
 		FCBCCC552122EF5500D94F7E /* MetalHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC542122EF5400D94F7E /* MetalHelper.swift */; };
+		FCCED60521D7646E00BE8D5F /* test_image_super in Resources */ = {isa = PBXBuildFile; fileRef = FCCED60421D7646E00BE8D5F /* test_image_super */; };
 		FCEBEC2C20E1391F00C0B14D /* paddle_mobile.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */; };
 		FCEBEC2D20E1391F00C0B14D /* paddle_mobile.framework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; };
 		FCF437E8214B6DDB00943429 /* MultiPredictViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCF437E7214B6DDB00943429 /* MultiPredictViewController.swift */; };
@@ -49,7 +71,6 @@
 		878829884E1A14D7044721D5 /* Pods-paddle-mobile-demo.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-demo.debug.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-demo/Pods-paddle-mobile-demo.debug.xcconfig"; sourceTree = "<group>"; };
 		C2E67E5C21524E460013F575 /* LoadPointerViewController.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = LoadPointerViewController.h; sourceTree = "<group>"; };
 		C2E67E5D21524E460013F575 /* LoadPointerViewController.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = LoadPointerViewController.m; sourceTree = "<group>"; };
-		FC013927210204A3008100E3 /* PreProcessKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PreProcessKernel.metal; sourceTree = "<group>"; };
 		FC039B7E20E11C550081E9F8 /* paddle-mobile-demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = "paddle-mobile-demo.app"; sourceTree = BUILT_PRODUCTS_DIR; };
 		FC039B8120E11C550081E9F8 /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = "<group>"; };
 		FC039B8320E11C550081E9F8 /* ViewController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ViewController.swift; sourceTree = "<group>"; };
@@ -58,15 +79,41 @@
 		FC039B8B20E11C560081E9F8 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = "<group>"; };
 		FC039B8D20E11C560081E9F8 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
 		FC203FA921CBFDBA00B37166 /* test.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = test.jpg; sourceTree = "<group>"; };
-		FC203FAD21CBFDBA00B37166 /* combined_mobilenet_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = combined_mobilenet_params; sourceTree = "<group>"; };
-		FC203FAE21CBFDBA00B37166 /* combined_mobilenet_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = combined_mobilenet_model; sourceTree = "<group>"; };
-		FC203FB021CBFDBA00B37166 /* yolo_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = yolo_params; sourceTree = "<group>"; };
-		FC203FB121CBFDBA00B37166 /* yolo_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = yolo_model; sourceTree = "<group>"; };
 		FC27991121343A39000B6BAD /* paddle-mobile-demo-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "paddle-mobile-demo-Bridging-Header.h"; sourceTree = "<group>"; };
+		FC2BFCBB21DF0A8600C262B2 /* 00001.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = 00001.jpg; sourceTree = "<group>"; };
+		FC2BFCBD21DF15D900C262B2 /* 123.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = 123.jpg; sourceTree = "<group>"; };
+		FC2BFCBF21DF279900C262B2 /* classify-img-output.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; path = "classify-img-output.png"; sourceTree = "<group>"; };
+		FC2BFD2A21DF3FE900C262B2 /* MobilenetSSD_AR.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MobilenetSSD_AR.swift; sourceTree = "<group>"; };
+		FC2BFD2B21DF3FE900C262B2 /* Genet.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Genet.swift; sourceTree = "<group>"; };
+		FC2BFD2C21DF3FE900C262B2 /* MobileNetSSD.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MobileNetSSD.swift; sourceTree = "<group>"; };
+		FC2BFD2D21DF3FE900C262B2 /* YoloNet.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = YoloNet.swift; sourceTree = "<group>"; };
+		FC2BFD2E21DF3FEA00C262B2 /* MobileNetCombined.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MobileNetCombined.swift; sourceTree = "<group>"; };
+		FC2BFD2F21DF3FEA00C262B2 /* MobileNet.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MobileNet.swift; sourceTree = "<group>"; };
+		FC2BFD3621DF46DE00C262B2 /* OCDemoViewController.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = OCDemoViewController.h; sourceTree = "<group>"; };
+		FC2BFD3721DF46DE00C262B2 /* OCDemoViewController.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = OCDemoViewController.m; sourceTree = "<group>"; };
+		FC2BFD3A21DF480300C262B2 /* CPUCompute.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CPUCompute.h; sourceTree = "<group>"; };
+		FC2BFD3B21DF480400C262B2 /* CPUCompute.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = CPUCompute.mm; sourceTree = "<group>"; };
+		FC2BFD3D21DF5CE800C262B2 /* PreProcessKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = PreProcessKernel.metal; sourceTree = "<group>"; };
+		FC2BFD4021DF5E1E00C262B2 /* PaddleMobileGPU.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = PaddleMobileGPU.m; sourceTree = "<group>"; };
+		FC2BFD4121DF5E1E00C262B2 /* PaddleMobileGPU.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PaddleMobileGPU.h; sourceTree = "<group>"; };
+		FC2BFD4221DF5E1E00C262B2 /* SuperResolutionNet.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = SuperResolutionNet.swift; sourceTree = "<group>"; };
 		FC4FD97B2140EE250073E130 /* libc++.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = "libc++.tbd"; path = "usr/lib/libc++.tbd"; sourceTree = SDKROOT; };
+		FC5E03B121DCE8D90016C137 /* mingren_input_data */ = {isa = PBXFileReference; lastKnownFileType = file; path = mingren_input_data; sourceTree = "<group>"; };
+		FC704C1721D2375300F98BAB /* super_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = super_params; sourceTree = "<group>"; };
+		FC704C1821D2375300F98BAB /* super_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = super_model; sourceTree = "<group>"; };
+		FC704C1D21D237FC00F98BAB /* combined_mobilenet_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = combined_mobilenet_params; sourceTree = "<group>"; };
+		FC704C1E21D237FC00F98BAB /* combined_mobilenet_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = combined_mobilenet_model; sourceTree = "<group>"; };
+		FC704C2021D237FC00F98BAB /* yolo_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = yolo_params; sourceTree = "<group>"; };
+		FC704C2121D237FC00F98BAB /* yolo_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = yolo_model; sourceTree = "<group>"; };
 		FC803BCB214D27920094B8E5 /* FPSCounter.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FPSCounter.swift; sourceTree = "<group>"; };
 		FC803BCC214D27920094B8E5 /* VideoCapture.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = VideoCapture.swift; sourceTree = "<group>"; };
+		FC9797BD21D6045B00F2FD90 /* banana.jpeg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = banana.jpeg; sourceTree = "<group>"; };
+		FC9797C021D608DF00F2FD90 /* mobilenet_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_model; sourceTree = "<group>"; };
+		FC9797C121D608DF00F2FD90 /* mobilenet_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_params; sourceTree = "<group>"; };
+		FC9797C621D609FB00F2FD90 /* synset.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = synset.txt; sourceTree = "<group>"; };
+		FC9797CE21D6506F00F2FD90 /* mingren.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = mingren.jpg; sourceTree = "<group>"; };
 		FCBCCC542122EF5400D94F7E /* MetalHelper.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MetalHelper.swift; sourceTree = "<group>"; };
+		FCCED60421D7646E00BE8D5F /* test_image_super */ = {isa = PBXFileReference; lastKnownFileType = file; path = test_image_super; sourceTree = "<group>"; };
 		FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; path = paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; };
 		FCF437E7214B6DDB00943429 /* MultiPredictViewController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MultiPredictViewController.swift; sourceTree = "<group>"; };
 /* End PBXFileReference section */
@@ -125,8 +172,11 @@
 		FC039B8020E11C550081E9F8 /* paddle-mobile-demo */ = {
 			isa = PBXGroup;
 			children = (
-				FC203FA821CBFDBA00B37166 /* images */,
+				FC2BFD4F21DF892500C262B2 /* Resources */,
-				FC203FAA21CBFDBA00B37166 /* models */,
+				FCBCCC542122EF5400D94F7E /* MetalHelper.swift */,
+				FC2BFD3F21DF5DDF00C262B2 /* OCInterface */,
+				FC27991121343A39000B6BAD /* paddle-mobile-demo-Bridging-Header.h */,
+				FC2BFD3921DF46F000C262B2 /* OCDemo */,
 				FC803BCA214D27920094B8E5 /* VideoCapture */,
 				FC8CFED2213519540094D569 /* Net */,
 				FC039B8120E11C550081E9F8 /* AppDelegate.swift */,
@@ -135,10 +185,7 @@
 				FC039B8820E11C560081E9F8 /* Assets.xcassets */,
 				FC039B8A20E11C560081E9F8 /* LaunchScreen.storyboard */,
 				FC039B8D20E11C560081E9F8 /* Info.plist */,
-				FC27991121343A39000B6BAD /* paddle-mobile-demo-Bridging-Header.h */,
 				FCF437E7214B6DDB00943429 /* MultiPredictViewController.swift */,
-				C2E67E5C21524E460013F575 /* LoadPointerViewController.h */,
-				C2E67E5D21524E460013F575 /* LoadPointerViewController.m */,
 			);
 			path = "paddle-mobile-demo";
 			sourceTree = "<group>";
@@ -146,44 +193,90 @@
 		FC203FA821CBFDBA00B37166 /* images */ = {
 			isa = PBXGroup;
 			children = (
+				FC2BFCBF21DF279900C262B2 /* classify-img-output.png */,
+				FC2BFCBD21DF15D900C262B2 /* 123.jpg */,
+				FC2BFCBB21DF0A8600C262B2 /* 00001.jpg */,
+				FC5E03B121DCE8D90016C137 /* mingren_input_data */,
+				FCCED60421D7646E00BE8D5F /* test_image_super */,
+				FC9797CE21D6506F00F2FD90 /* mingren.jpg */,
+				FC9797BD21D6045B00F2FD90 /* banana.jpeg */,
 				FC203FA921CBFDBA00B37166 /* test.jpg */,
 			);
-			name = images;
+			path = images;
-			path = ../../images;
 			sourceTree = "<group>";
 		};
 		FC203FAA21CBFDBA00B37166 /* models */ = {
 			isa = PBXGroup;
 			children = (
-				FC203FAB21CBFDBA00B37166 /* vision_model */,
+				FC9797BF21D608DF00F2FD90 /* mobilenet */,
+				FC704C1B21D237FC00F98BAB /* vision_model */,
+				FC704C1621D2375300F98BAB /* superresoltion */,
+			);
+			path = models;
+			sourceTree = "<group>";
+		};
+		FC2BFD3921DF46F000C262B2 /* OCDemo */ = {
+			isa = PBXGroup;
+			children = (
+				C2E67E5C21524E460013F575 /* LoadPointerViewController.h */,
+				C2E67E5D21524E460013F575 /* LoadPointerViewController.m */,
+				FC2BFD3621DF46DE00C262B2 /* OCDemoViewController.h */,
+				FC2BFD3721DF46DE00C262B2 /* OCDemoViewController.m */,
+			);
+			path = OCDemo;
+			sourceTree = "<group>";
+		};
+		FC2BFD3F21DF5DDF00C262B2 /* OCInterface */ = {
+			isa = PBXGroup;
+			children = (
+				FC2BFD4121DF5E1E00C262B2 /* PaddleMobileGPU.h */,
+				FC2BFD4021DF5E1E00C262B2 /* PaddleMobileGPU.m */,
+				FC2BFD4221DF5E1E00C262B2 /* SuperResolutionNet.swift */,
 			);
-			name = models;
+			path = OCInterface;
-			path = ../../models;
 			sourceTree = "<group>";
 		};
-		FC203FAB21CBFDBA00B37166 /* vision_model */ = {
+		FC2BFD4F21DF892500C262B2 /* Resources */ = {
 			isa = PBXGroup;
 			children = (
-				FC203FAC21CBFDBA00B37166 /* mobilenet */,
+				FC203FA821CBFDBA00B37166 /* images */,
-				FC203FAF21CBFDBA00B37166 /* yolo */,
+				FC203FAA21CBFDBA00B37166 /* models */,
+			);
+			path = Resources;
+			sourceTree = "<group>";
+		};
+		FC704C1621D2375300F98BAB /* superresoltion */ = {
+			isa = PBXGroup;
+			children = (
+				FC704C1721D2375300F98BAB /* super_params */,
+				FC704C1821D2375300F98BAB /* super_model */,
+			);
+			path = superresoltion;
+			sourceTree = "<group>";
+		};
+		FC704C1B21D237FC00F98BAB /* vision_model */ = {
+			isa = PBXGroup;
+			children = (
+				FC704C1C21D237FC00F98BAB /* mobilenet */,
+				FC704C1F21D237FC00F98BAB /* yolo */,
 			);
 			path = vision_model;
 			sourceTree = "<group>";
 		};
-		FC203FAC21CBFDBA00B37166 /* mobilenet */ = {
+		FC704C1C21D237FC00F98BAB /* mobilenet */ = {
 			isa = PBXGroup;
 			children = (
-				FC203FAD21CBFDBA00B37166 /* combined_mobilenet_params */,
+				FC704C1D21D237FC00F98BAB /* combined_mobilenet_params */,
-				FC203FAE21CBFDBA00B37166 /* combined_mobilenet_model */,
+				FC704C1E21D237FC00F98BAB /* combined_mobilenet_model */,
 			);
 			path = mobilenet;
 			sourceTree = "<group>";
 		};
-		FC203FAF21CBFDBA00B37166 /* yolo */ = {
+		FC704C1F21D237FC00F98BAB /* yolo */ = {
 			isa = PBXGroup;
 			children = (
-				FC203FB021CBFDBA00B37166 /* yolo_params */,
+				FC704C2021D237FC00F98BAB /* yolo_params */,
-				FC203FB121CBFDBA00B37166 /* yolo_model */,
+				FC704C2121D237FC00F98BAB /* yolo_model */,
 			);
 			path = yolo;
 			sourceTree = "<group>";
@@ -200,12 +293,29 @@
 		FC8CFED2213519540094D569 /* Net */ = {
 			isa = PBXGroup;
 			children = (
-				FC013927210204A3008100E3 /* PreProcessKernel.metal */,
+				FC2BFD3A21DF480300C262B2 /* CPUCompute.h */,
-				FCBCCC542122EF5400D94F7E /* MetalHelper.swift */,
+				FC2BFD3B21DF480400C262B2 /* CPUCompute.mm */,
+				FC2BFD3D21DF5CE800C262B2 /* PreProcessKernel.metal */,
+				FC2BFD2B21DF3FE900C262B2 /* Genet.swift */,
+				FC2BFD2F21DF3FEA00C262B2 /* MobileNet.swift */,
+				FC2BFD2E21DF3FEA00C262B2 /* MobileNetCombined.swift */,
+				FC2BFD2A21DF3FE900C262B2 /* MobilenetSSD_AR.swift */,
+				FC2BFD2C21DF3FE900C262B2 /* MobileNetSSD.swift */,
+				FC2BFD2D21DF3FE900C262B2 /* YoloNet.swift */,
 			);
 			path = Net;
 			sourceTree = "<group>";
 		};
+		FC9797BF21D608DF00F2FD90 /* mobilenet */ = {
+			isa = PBXGroup;
+			children = (
+				FC9797C621D609FB00F2FD90 /* synset.txt */,
+				FC9797C021D608DF00F2FD90 /* mobilenet_model */,
+				FC9797C121D608DF00F2FD90 /* mobilenet_params */,
+			);
+			path = mobilenet;
+			sourceTree = "<group>";
+		};
 /* End PBXGroup section */
 /* Begin PBXNativeTarget section */
@@ -268,14 +378,26 @@
 			isa = PBXResourcesBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
+				FCCED60521D7646E00BE8D5F /* test_image_super in Resources */,
 				FC039B8C20E11C560081E9F8 /* LaunchScreen.storyboard in Resources */,
-				FC203FB421CBFDBA00B37166 /* combined_mobilenet_model in Resources */,
+				FC9797CF21D6506F00F2FD90 /* mingren.jpg in Resources */,
-				FC203FB321CBFDBA00B37166 /* combined_mobilenet_params in Resources */,
+				FC704C2221D237FC00F98BAB /* combined_mobilenet_params in Resources */,
+				FC704C1921D2375300F98BAB /* super_params in Resources */,
+				FC2BFCBE21DF15D900C262B2 /* 123.jpg in Resources */,
 				FC039B8920E11C560081E9F8 /* Assets.xcassets in Resources */,
-				FC203FB521CBFDBA00B37166 /* yolo_params in Resources */,
+				FC9797C721D609FB00F2FD90 /* synset.txt in Resources */,
-				FC203FB621CBFDBA00B37166 /* yolo_model in Resources */,
+				FC5E03B221DCE8D90016C137 /* mingren_input_data in Resources */,
+				FC704C1A21D2375300F98BAB /* super_model in Resources */,
 				FC039B8720E11C550081E9F8 /* Main.storyboard in Resources */,
+				FC9797C221D608E000F2FD90 /* mobilenet_model in Resources */,
+				FC2BFCC021DF279900C262B2 /* classify-img-output.png in Resources */,
 				FC203FB221CBFDBA00B37166 /* test.jpg in Resources */,
+				FC704C2321D237FC00F98BAB /* combined_mobilenet_model in Resources */,
+				FC9797C321D608E000F2FD90 /* mobilenet_params in Resources */,
+				FC704C2421D237FC00F98BAB /* yolo_params in Resources */,
+				FC2BFCBC21DF0A8600C262B2 /* 00001.jpg in Resources */,
+				FC9797BE21D6045B00F2FD90 /* banana.jpeg in Resources */,
+				FC704C2521D237FC00F98BAB /* yolo_model in Resources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
@@ -325,14 +447,24 @@
 			isa = PBXSourcesBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
+				FC2BFD3221DF3FEA00C262B2 /* MobileNetSSD.swift in Sources */,
+				FC2BFD3C21DF480400C262B2 /* CPUCompute.mm in Sources */,
+				FC2BFD4321DF5E1E00C262B2 /* PaddleMobileGPU.m in Sources */,
 				FC039B8420E11C550081E9F8 /* ViewController.swift in Sources */,
 				FC803BCE214D27930094B8E5 /* VideoCapture.swift in Sources */,
-				FC013928210204A3008100E3 /* PreProcessKernel.metal in Sources */,
 				FCF437E8214B6DDB00943429 /* MultiPredictViewController.swift in Sources */,
+				FC2BFD3021DF3FEA00C262B2 /* MobilenetSSD_AR.swift in Sources */,
+				FC2BFD3321DF3FEA00C262B2 /* YoloNet.swift in Sources */,
+				FC2BFD3421DF3FEA00C262B2 /* MobileNetCombined.swift in Sources */,
 				FCBCCC552122EF5500D94F7E /* MetalHelper.swift in Sources */,
 				FC803BCD214D27930094B8E5 /* FPSCounter.swift in Sources */,
+				FC2BFD3521DF3FEA00C262B2 /* MobileNet.swift in Sources */,
 				C2E67E5E21524E460013F575 /* LoadPointerViewController.m in Sources */,
+				FC2BFD3121DF3FEA00C262B2 /* Genet.swift in Sources */,
 				FC039B8220E11C550081E9F8 /* AppDelegate.swift in Sources */,
+				FC2BFD4421DF5E1E00C262B2 /* SuperResolutionNet.swift in Sources */,
+				FC2BFD3E21DF5CE800C262B2 /* PreProcessKernel.metal in Sources */,
+				FC2BFD3821DF46DE00C262B2 /* OCDemoViewController.m in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};

--- a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcshareddata/xcschemes/paddle-mobile-demo.xcscheme
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcshareddata/xcschemes/paddle-mobile-demo.xcscheme
+<?xml version="1.0" encoding="UTF-8"?>
+<Scheme
+   LastUpgradeVersion = "1010"
+   version = "1.3">
+   <BuildAction
+      parallelizeBuildables = "YES"
+      buildImplicitDependencies = "YES">
+      <BuildActionEntries>
+         <BuildActionEntry
+            buildForTesting = "YES"
+            buildForRunning = "YES"
+            buildForProfiling = "YES"
+            buildForArchiving = "YES"
+            buildForAnalyzing = "YES">
+            <BuildableReference
+               BuildableIdentifier = "primary"
+               BlueprintIdentifier = "FC039B7D20E11C550081E9F8"
+               BuildableName = "paddle-mobile-demo.app"
+               BlueprintName = "paddle-mobile-demo"
+               ReferencedContainer = "container:paddle-mobile-demo.xcodeproj">
+            </BuildableReference>
+         </BuildActionEntry>
+      </BuildActionEntries>
+   </BuildAction>
+   <TestAction
+      buildConfiguration = "Debug"
+      selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
+      selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
+      shouldUseLaunchSchemeArgsEnv = "YES">
+      <Testables>
+      </Testables>
+      <MacroExpansion>
+         <BuildableReference
+            BuildableIdentifier = "primary"
+            BlueprintIdentifier = "FC039B7D20E11C550081E9F8"
+            BuildableName = "paddle-mobile-demo.app"
+            BlueprintName = "paddle-mobile-demo"
+            ReferencedContainer = "container:paddle-mobile-demo.xcodeproj">
+         </BuildableReference>
+      </MacroExpansion>
+      <AdditionalOptions>
+      </AdditionalOptions>
+   </TestAction>
+   <LaunchAction
+      buildConfiguration = "Debug"
+      selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
+      selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
+      launchStyle = "0"
+      useCustomWorkingDirectory = "NO"
+      ignoresPersistentStateOnLaunch = "NO"
+      debugDocumentVersioning = "YES"
+      debugServiceExtension = "internal"
+      allowLocationSimulation = "YES">
+      <BuildableProductRunnable
+         runnableDebuggingMode = "0">
+         <BuildableReference
+            BuildableIdentifier = "primary"
+            BlueprintIdentifier = "FC039B7D20E11C550081E9F8"
+            BuildableName = "paddle-mobile-demo.app"
+            BlueprintName = "paddle-mobile-demo"
+            ReferencedContainer = "container:paddle-mobile-demo.xcodeproj">
+         </BuildableReference>
+      </BuildableProductRunnable>
+      <AdditionalOptions>
+      </AdditionalOptions>
+   </LaunchAction>
+   <ProfileAction
+      buildConfiguration = "Release"
+      shouldUseLaunchSchemeArgsEnv = "YES"
+      savedToolIdentifier = ""
+      useCustomWorkingDirectory = "NO"
+      debugDocumentVersioning = "YES">
+      <BuildableProductRunnable
+         runnableDebuggingMode = "0">
+         <BuildableReference
+            BuildableIdentifier = "primary"
+            BlueprintIdentifier = "FC039B7D20E11C550081E9F8"
+            BuildableName = "paddle-mobile-demo.app"
+            BlueprintName = "paddle-mobile-demo"
+            ReferencedContainer = "container:paddle-mobile-demo.xcodeproj">
+         </BuildableReference>
+      </BuildableProductRunnable>
+   </ProfileAction>
+   <AnalyzeAction
+      buildConfiguration = "Debug">
+   </AnalyzeAction>
+   <ArchiveAction
+      buildConfiguration = "Release"
+      revealArchiveInOrganizer = "YES">
+   </ArchiveAction>
+</Scheme>
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
 <?xml version="1.0" encoding="UTF-8"?>
-<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="14113" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="14460.31" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
    <device id="retina4_7" orientation="portrait">
        <adaptation id="fullscreen"/>
    </device>
    <dependencies>
        <deployment identifier="iOS"/>
-        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="14088"/>
+        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="14460.20"/>
-        <capability name="Aspect ratio constraints" minToolsVersion="5.1"/>
        <capability name="Safe area layout guides" minToolsVersion="9.0"/>
        <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
    </dependencies>
@@ -20,7 +19,7 @@
                        <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
                        <subviews>
                            <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="TQt-X9-PdF">
-                                <rect key="frame" x="164" y="318" width="46" height="30"/>
+                                <rect key="frame" x="164.5" y="318.5" width="46" height="30"/>
                                <state key="normal" title="Button"/>
                                <connections>
                                    <action selector="predictAct:" destination="Vwd-lt-764" eventType="touchUpInside" id="d4z-Cv-6jY"/>
@@ -60,7 +59,7 @@
                                <nil key="highlightedColor"/>
                            </label>
                            <pickerView contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="DlO-dk-RMr">
-                                <rect key="frame" x="55" y="510.5" width="320" height="80"/>
+                                <rect key="frame" x="55" y="510" width="320" height="80"/>
                                <constraints>
                                    <constraint firstAttribute="height" constant="80" id="Sbi-05-Mwd"/>
                                </constraints>
@@ -83,6 +82,9 @@
                            <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" showsTouchWhenHighlighted="YES" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="wUL-9N-u1V">
                                <rect key="frame" x="16" y="597" width="63.5" height="30"/>
                                <color key="backgroundColor" white="0.0" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
+                                <constraints>
+                                    <constraint firstAttribute="width" secondItem="wUL-9N-u1V" secondAttribute="height" multiplier="21:10" id="cp7-bd-CvU"/>
+                                </constraints>
                                <state key="normal" title="Image">
                                    <color key="titleColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
                                </state>

--- a/metal/paddle-mobile-demo/paddle-mobile-demo/LoadPointerViewController.h
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/LoadPointerViewController.h
-//
-//  LoadPointerViewController.h
-//  paddle-mobile-demo
-//
-//  Created by Xiao,Haichun on 2018/9/19.
-//  Copyright © 2018年 orange. All rights reserved.
-//
-#import <UIKit/UIKit.h>
-@interface LoadPointerViewController : UIViewController
-@end
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MetalHelper.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MetalHelper.swift
@@ -27,7 +27,4 @@ public class MetalHelper {
    queue = device.makeCommandQueue()!
    textureLoader = MTKTextureLoader.init(device: device)
  }
 }
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift
@@ -22,10 +22,10 @@ class MultiPredictViewController: UIViewController {
    super.viewDidLoad()
    let mobileNet = MobileNet_ssd_hand.init(device: MetalHelper.shared.device)
    let genet = Genet.init(device: MetalHelper.shared.device)
-    runner1 = Runner.init(inNet: mobileNet, commandQueue: MetalHelper.shared.queue, inPlatform: .GPU)
+    runner1 = Runner.init(inNet: mobileNet, commandQueue: MetalHelper.shared.queue)
    let queue2 = MetalHelper.shared.device.makeCommandQueue()
-    runner2 = Runner.init(inNet: genet, commandQueue: MetalHelper.shared.queue, inPlatform: .GPU)
+    runner2 = Runner.init(inNet: genet, commandQueue: MetalHelper.shared.queue)
  }
  @IBAction func predictAct(_ sender: Any) {

--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/BufferToTexture.metal
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/BufferToTexture.metal
+//
+//  BufferToTexture.metal
+//  paddle-mobile-demo
+//
+//  Created by liuRuiLong on 2018/12/28.
+//  Copyright © 2018 orange. All rights reserved.
+//
+#include <metal_stdlib>
+using namespace metal;
+// Copies one float per grid position from a linear buffer into a float texture.
+//
+//   input      - source buffer bound at buffer index 0; read at the row-major
+//                offset `width * gid.y + gid.x`.
+//   outTexture - write-only destination texture bound at texture index 0.
+//   gid        - position of this thread in the dispatch grid.
+//
+// The value read is stored in the first component of the written texel; the
+// remaining three components are written as 0.
+kernel void buffer_to_texture_kernel(const device float *input [[buffer(0)]],
+                                     texture2d<float, access::write> outTexture [[texture(0)]],
+                                     uint2 gid [[thread_position_in_grid]]) {
+  const uint width = outTexture.get_width();
+  const uint height = outTexture.get_height();
+  // Only threads that fall inside the texture bounds read and write.
+  if (gid.x < width && gid.y < height) {
+    const float value = input[width * gid.y + gid.x];
+    outTexture.write(float4(value, 0.0f, 0.0f, 0.0f), gid);
+  }
+}
+// Half-precision-texture variant of the buffer-to-texture copy.
+//
+//   input      - source float buffer bound at buffer index 0; read at the
+//                row-major offset `width * gid.y + gid.x`.
+//   outTexture - write-only half texture bound at texture index 0.
+//   gid        - position of this thread in the dispatch grid.
+//
+// The float read from the buffer is packed into a half4 as its first
+// component; the remaining three components are written as 0.
+kernel void buffer_to_texture_kernel_half(const device float *input [[buffer(0)]],
+                                          texture2d<half, access::write> outTexture [[texture(0)]],
+                                          uint2 gid [[thread_position_in_grid]]) {
+  const uint width = outTexture.get_width();
+  const uint height = outTexture.get_height();
+  // Only threads that fall inside the texture bounds read and write.
+  if (gid.x < width && gid.y < height) {
+    const float value = input[width * gid.y + gid.x];
+    outTexture.write(half4(value, 0.0f, 0.0f, 0.0f), gid);
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/CPUCompute.h
+++ b/metal/paddle-mobile/paddle-mobile/CPUCompute.h
@@ -16,7 +16,6 @@
 #import <Foundation/Foundation.h>
 @interface CPUResult: NSObject
 @property (assign, nonatomic) float *output;
 @property (assign, nonatomic) int outputSize;

--- a/metal/paddle-mobile/paddle-mobile/CPUCompute.mm
+++ b/metal/paddle-mobile/paddle-mobile/CPUCompute.mm
@@ -12,7 +12,6 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 #import "CPUCompute.h"
 #import <map>
@@ -20,9 +19,6 @@
 #import <utility>
 #import <algorithm>
 struct NMSParam {
  float *score_data;

--- a/metal/paddle-mobile/paddle-mobile/Genet.swift
+++ b/metal/paddle-mobile/paddle-mobile/Genet.swift
@@ -13,42 +13,36 @@
 limitations under the License. */
 import Foundation
+import paddle_mobile
 public class Genet: Net {
+  // Net subclass for the "genet" model. Both initializers install a
+  // GenetPreProccess kernel and set inputDim to [1, 128, 128, 3].
+  //
+  // Bundle-based initializer: looks up "genet_model" and "genet_params" in the
+  // main bundle. The ?! operator is defined elsewhere; presumably it fails with
+  // the given message when the lookup returns nil (TODO confirm).
  @objc public override init(device: MTLDevice) {
    super.init(device: device)
-    means = [128.0, 128.0, 128.0]
-    scale = 0.017
-    except = 0
    modelPath = Bundle.main.path(forResource: "genet_model", ofType: nil) ?! "model null"
    paramPath = Bundle.main.path(forResource: "genet_params", ofType: nil) ?! "para null"
-    modelDir = ""
    preprocessKernel = GenetPreProccess.init(device: device)
-    dim = (n: 1, h: 128, w: 128, c: 3)
+    inputDim = Dim.init(inDim: [1, 128, 128, 3])
  }
+  // Pointer-based initializer: forwards the caller-supplied parameter/model
+  // pointers and their sizes to the superclass initializer unchanged.
-  @objc override public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) {
+  @objc override public init(device: MTLDevice, paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) {
-    super.init(device:device,paramPointer:paramPointer,paramSize:paramSize,modePointer:modePointer,modelSize:modelSize)
+    super.init(device: device,
-    means = [128.0, 128.0, 128.0]
+               paramPointer: paramPointer,
-    scale = 0.017
+               paramSize: paramSize,
-    except = 0
+               modePointer: modePointer,
-    modelPath = ""
+               modelSize: modelSize)
-    paramPath = ""
-    modelDir = ""
    preprocessKernel = GenetPreProccess.init(device: device)
-    dim = (n: 1, h: 128, w: 128, c: 3)
+    inputDim = Dim.init(inDim: [1, 128, 128, 3])
  }
+  // Preprocess kernel bound to the "genet_preprocess" Metal function, loaded
+  // with .LoadMetalInDefaultLib and a 128 x 128 x 3 output shape.
  class GenetPreProccess: CusomKernel {
    init(device: MTLDevice) {
-      let s = CusomKernel.Shape.init(inWidth: 128, inHeight: 128, inChannel: 3)
+      let s = Shape.init(inWidth: 128, inHeight: 128, inChannel: 3)
-      super.init(device: device, inFunctionName: "genet_preprocess", outputDim: s, usePaddleMobileLib: false)
+      super.init(device: device, inFunctionName: "genet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
    }
  }
+  // Formats only the first element of the result for display.
  override  public func resultStr(res: ResultHolder) -> String {
-//    fatalError()
+    return " \(res.result[0]) ... "
-    return " \(res.result![0]) ... "
  }
 }
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+import Foundation
+import paddle_mobile
+/// Demo wrapper for the MobileNet classification model.
+/// Input dimension is [1, 224, 224, 3]; results are shown as the top five labels.
+public class MobileNet: Net {
+  /// Preprocess kernel bound to the "mobilenet_preprocess" Metal function,
+  /// loaded with .LoadMetalInDefaultLib and a 224 x 224 x 3 output shape.
+  class MobilenetPreProccess: CusomKernel {
+    init(device: MTLDevice) {
+      let outputShape = Shape(inWidth: 224, inHeight: 224, inChannel: 3)
+      super.init(device: device,
+                 inFunctionName: "mobilenet_preprocess",
+                 outputDim: outputShape,
+                 metalLoadModel: .LoadMetalInDefaultLib,
+                 metalLibPath: nil)
+    }
+  }
+  /// Label table read from a bundled text file, one entry per line.
+  class PreWords {
+    var contents: [String] = []
+    /// Loads `fileName.type` from `inBundle`. Lines of ten characters or fewer
+    /// are dropped, and the first ten characters of every kept line are
+    /// stripped. Traps if the resource is missing or cannot be read.
+    init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) {
+      guard let filePath = inBundle.path(forResource: fileName, ofType: type) else {
+        fatalError("no file call \(fileName)")
+      }
+      let text = try! String(contentsOfFile: filePath)
+      var parsed: [String] = []
+      for line in text.components(separatedBy: CharacterSet.newlines) where line.count > 10 {
+        let labelStart = line.index(line.startIndex, offsetBy: 10)
+        parsed.append(String(line[labelStart...]))
+      }
+      contents = parsed
+    }
+    /// Returns the label stored at `index`.
+    subscript(index: Int) -> String {
+      return contents[index]
+    }
+  }
+  /// Labels loaded from the bundled "synset.txt" resource.
+  let labels = PreWords(fileName: "synset")
+  /// Builds one line per entry returned by `top(r: 5)`:
+  /// rank, label and the value scaled by 100 as a percentage.
+  override public func resultStr(res: ResultHolder) -> String {
+    let scores = res.result
+    let values = (0..<res.capacity).map { scores[$0] }
+    var lines: [String] = []
+    for (rank, entry) in values.top(r: 5).enumerated() {
+      lines.append(String(format: "%d: %@ (%3.2f%%)", rank + 1, labels[entry.0], entry.1 * 100))
+    }
+    return lines.joined(separator: "\n")
+  }
+  /// Resolves the model and parameter files from the main bundle, then
+  /// installs the preprocess kernel and the input dimension.
+  override public init(device: MTLDevice) {
+    super.init(device: device)
+    except = 0
+    let bundle = Bundle.main
+    modelPath = bundle.path(forResource: "mobilenet_model", ofType: nil) ?! "model null"
+    paramPath = bundle.path(forResource: "mobilenet_params", ofType: nil) ?! "para null"
+    // Disabled alternative: load the kernels from a custom metallib.
+//    metalLoadMode = .LoadMetalInCustomMetalLib
+//    metalLibPath = Bundle.main.path(forResource: "PaddleMobileMetal", ofType: "metallib") ?! " can't be nil "
+    preprocessKernel = MobilenetPreProccess(device: device)
+    inputDim = Dim(inDim: [1, 224, 224, 3])
+  }
+}
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetCombined.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetCombined.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+import Foundation
+import paddle_mobile
+/// Demo wrapper for the combined MobileNet model; input dimension is [1, 224, 224, 3].
+public class MobileNetCombined: Net {
+  /// Resolves the model and parameter files from the main bundle and sets the input dimension.
+  @objc public override init(device: MTLDevice) {
+    super.init(device: device)
+    except = 0
+    let bundle = Bundle.main
+    modelPath = bundle.path(forResource: "combined_mobilenet_model", ofType: nil) ?! "model null"
+    paramPath = bundle.path(forResource: "combined_mobilenet_params", ofType: nil) ?! "para null"
+    inputDim = Dim(inDim: [1, 224, 224, 3])
+    // Disabled alternative: load the kernels from a custom metallib.
+//    metalLoadMode = .LoadMetalInCustomMetalLib
+//    metalLibPath = Bundle.main.path(forResource: "PaddleMobileMetal", ofType: "metallib") ?! " can't be nil "
+  }
+  /// Formats only the first element of the result for display.
+  override public func resultStr(res: ResultHolder) -> String {
+    let first = res.result[0]
+    return " \(first) ... "
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/MobileNetSSD.swift
+++ b/metal/paddle-mobile/paddle-mobile/MobileNetSSD.swift
@@ -13,36 +13,35 @@
 limitations under the License. */
 import Foundation
+import paddle_mobile
-public class MobileNet_ssd_hand: Net{
+public class MobileNet_ssd_hand: Net {
  @objc public override init(device: MTLDevice) {
    super.init(device: device)
-    means = [123.68, 116.78, 103.94]
-    scale = 0.017
    except = 2
    modelPath = Bundle.main.path(forResource: "ssd_hand_model", ofType: nil) ?! "model null"
    paramPath = Bundle.main.path(forResource: "ssd_hand_params", ofType: nil) ?! "para null"
-    modelDir = ""
+//    metalLoadMode = .LoadMetalInCustomMetalLib
+//    metalLibPath = Bundle.main.path(forResource: "PaddleMobileMetal", ofType: "metallib") ?! " can't be nil "
    preprocessKernel = MobilenetssdPreProccess.init(device: device)
-    dim = (n: 1, h: 300, w: 300, c: 3)
+    inputDim = Dim.init(inDim: [1, 300, 300, 3])
  }
  @objc override public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) {
    super.init(device:device,paramPointer:paramPointer,paramSize:paramSize,modePointer:modePointer,modelSize:modelSize)
-    means = [123.68, 116.78, 103.94]
-    scale = 0.017
    except = 2
    modelPath = ""
    paramPath = ""
-    modelDir = ""
+//    metalLoadMode = .LoadMetalInCustomMetalLib
+//    metalLibPath = Bundle.main.path(forResource: "PaddleMobileMetal", ofType: "metallib") ?! " can't be nil "
    preprocessKernel = MobilenetssdPreProccess.init(device: device)
-    dim = (n: 1, h: 300, w: 300, c: 3)
+    inputDim = Dim.init(inDim: [1, 300, 300, 3])
  }
  class MobilenetssdPreProccess: CusomKernel {
    init(device: MTLDevice) {
-      let s = CusomKernel.Shape.init(inWidth: 300, inHeight: 300, inChannel: 3)
+      let s = Shape.init(inWidth: 300, inHeight: 300, inChannel: 3)
-      super.init(device: device, inFunctionName: "mobilenet_ssd_preprocess", outputDim: s, usePaddleMobileLib: false)
+      super.init(device: device, inFunctionName: "mobilenet_ssd_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
    }
  }
@@ -50,7 +49,7 @@ public class MobileNet_ssd_hand: Net{
    return " \(res)"
  }
-  override func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder {
+  override public func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder {
 //    guard let interRes = paddleMobileRes.intermediateResults else {
 //      fatalError(" need have inter result ")

--- a/metal/paddle-mobile/paddle-mobile/MobilenetSSD_AR.swift
+++ b/metal/paddle-mobile/paddle-mobile/MobilenetSSD_AR.swift
@@ -13,55 +13,49 @@
 limitations under the License. */
 import Foundation
+import paddle_mobile
-public class MobileNet_ssd_AR: Net{
+public class MobileNet_ssd_AR: Net {
  @objc public override init(device: MTLDevice) {
    super.init(device: device)
-    means = [103.94, 116.78, 123.68]
-    scale = 1
    except = 2
    modelPath = Bundle.main.path(forResource: "ar_model", ofType: nil) ?! "model null"
    paramPath = Bundle.main.path(forResource: "ar_params", ofType: nil) ?! "para null"
-    modelDir = ""
    preprocessKernel = MobilenetssdPreProccess.init(device: device)
-    dim = (n: 1, h: 160, w: 160, c: 3)
+    inputDim = Dim.init(inDim: [1, 160, 160, 3])
  }
  @objc override public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) {
    super.init(device:device,paramPointer:paramPointer,paramSize:paramSize,modePointer:modePointer,modelSize:modelSize)
-    means = [103.94, 116.78, 123.68]
-    scale = 1
    except = 2
-    modelPath = ""
-    paramPath = ""
-    modelDir = ""
    preprocessKernel = MobilenetssdPreProccess.init(device: device)
-    dim = (n: 1, h: 160, w: 160, c: 3)
+    inputDim = Dim.init(inDim: [1, 160, 160, 3])
  }
  class MobilenetssdPreProccess: CusomKernel {
-    init(device: MTLDevice) {
+    init(device: MTLDevice)  {
-      let s = CusomKernel.Shape.init(inWidth: 160, inHeight: 160, inChannel: 3)
+      let s = Shape.init(inWidth: 160, inHeight: 160, inChannel: 3)
-      super.init(device: device, inFunctionName: "mobilent_ar_preprocess", outputDim: s, usePaddleMobileLib: false)
+      super.init(device: device, inFunctionName: "mobilent_ar_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil)
    }
  }
  override public func resultStr(res: ResultHolder) -> String {
-    return " \(res.result![0])"
+    return " \(res.result[0])"
  }
-  override func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder {
+  override public func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder {
-    guard let interRes = paddleMobileRes.intermediateResults else {
+    fatalError()
-      fatalError(" need have inter result ")
+//    guard let interRes = paddleMobileRes.intermediateResults else {
-    }
+//      fatalError(" need have inter result ")
+//    }
-    guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as?  FetchHolder else {
+//
-      fatalError(" need score ")
+//    guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as?  FetchHolder else {
-    }
+//      fatalError(" need score ")
+//    }
-    guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? FetchHolder else {
+//
-      fatalError()
+//    guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? FetchHolder else {
-    }
+//      fatalError()
+//    }
 //    let startDate = Date.init()
@@ -72,19 +66,19 @@ public class MobileNet_ssd_AR: Net{
 //
 //    print((0..<bbox.capacity).map{ bbox.result[$0] }.strideArray())
-    let nmsCompute = NMSCompute.init()
+//    let nmsCompute = NMSCompute.init()
-    nmsCompute.scoreThredshold = 0.25
+//    nmsCompute.scoreThredshold = 0.25
-    nmsCompute.nmsTopK = 100
+//    nmsCompute.nmsTopK = 100
-    nmsCompute.keepTopK = 100
+//    nmsCompute.keepTopK = 100
-    nmsCompute.nmsEta = 1.0
+//    nmsCompute.nmsEta = 1.0
-    nmsCompute.nmsThreshold = 0.449999988
+//    nmsCompute.nmsThreshold = 0.449999988
-    nmsCompute.background_label = 0;
+//    nmsCompute.background_label = 0;
-    nmsCompute.scoreDim = [NSNumber.init(value: score.dim[0]), NSNumber.init(value: score.dim[1]), NSNumber.init(value: score.dim[2])]
+//    nmsCompute.scoreDim = [NSNumber.init(value: score.dim[0]), NSNumber.init(value: score.dim[1]), NSNumber.init(value: score.dim[2])]
-    nmsCompute.bboxDim = [NSNumber.init(value: bbox.dim[0]), NSNumber.init(value: bbox.dim[1]), NSNumber.init(value: bbox.dim[2])]
+//    nmsCompute.bboxDim = [NSNumber.init(value: bbox.dim[0]), NSNumber.init(value: bbox.dim[1]), NSNumber.init(value: bbox.dim[2])]
-    guard let result = nmsCompute.compute(withScore: score.result, andBBoxs: bbox.result) else {
+//    guard let result = nmsCompute.compute(withScore: score.result, andBBoxs: bbox.result) else {
-      fatalError( " result error " )
+//      fatalError( " result error " )
-    }
+//    }
-    let resultHolder = ResultHolder.init(inResult: result.output, inCapacity: Int(result.outputSize))
+//    let resultHolder = ResultHolder.init(inResult: result.output, inCapacity: Int(result.outputSize))
 //    for i in 0..<Int(result.outputSize) {
 //
 //      print("i \(i) : \(result.output[i])")
@@ -92,62 +86,63 @@ public class MobileNet_ssd_AR: Net{
 //    print(Date.init().timeIntervalSince(startDate))
 //    print(resultHolder.result![0])
-    return resultHolder
+//    return resultHolder
  }
-  override func updateProgram(program: Program) {
+//  override func updateProgram(program: Program) {
-    for i in [56, 66, 76, 86, 93, 99] {
-      let opDesc = program.programDesc.blocks[0].ops[i]
+//    for i in [56, 66, 76, 86, 93, 99] {
-      let output = opDesc.outputs["Out"]!.first!
+//      let opDesc = program.programDesc.blocks[0].ops[i]
-      let v = program.scope[output]!
+//      let output = opDesc.outputs["Out"]!.first!
-      let originTexture = v as! Texture<Float32>
+//      let v = program.scope[output]!
-      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1] / 7, originTexture.tensorDim[0] * 7])
+//      let originTexture = v as! Texture
+//      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1] / 7, originTexture.tensorDim[0] * 7])
-      originTexture.dim = Dim.init(inDim: [1, 1, originTexture.dim[3] / 7, originTexture.dim[2] * 7])
+//      
+//      originTexture.dim = Dim.init(inDim: [1, 1, originTexture.dim[3] / 7, originTexture.dim[2] * 7])
-      originTexture.padToFourDim = Dim.init(inDim: [1, 1, originTexture.padToFourDim[3] / 7, originTexture.padToFourDim[2] * 7])
+//      
+//      originTexture.padToFourDim = Dim.init(inDim: [1, 1, originTexture.padToFourDim[3] / 7, originTexture.padToFourDim[2] * 7])
-      program.scope[output] = originTexture
+//      
+//      program.scope[output] = originTexture
-      if i == 99 {
+//      
-        opDesc.attrs["axis"] = 0
+//      if i == 99 {
-      } else {
+//        opDesc.attrs["axis"] = 0
-        opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
+//      } else {
-      }
+//        opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
-    }
+//      }
+//    }
-    for i in [58, 59, 88, 89, 95, 96, 68, 69, 78, 79] {
+//    
-      let opDesc = program.programDesc.blocks[0].ops[i]
+//    for i in [58, 59, 88, 89, 95, 96, 68, 69, 78, 79] {
-      let output = opDesc.outputs["Out"]!.first!
+//      let opDesc = program.programDesc.blocks[0].ops[i]
-      let v = program.scope[output]!
+//      let output = opDesc.outputs["Out"]!.first!
+//      let v = program.scope[output]!
+//      
+//      
-      let originTexture = v as! Texture<Float32>
+//      
-      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
+//      let originTexture = v as! Texture
-      opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
+//      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
-    }
+//      opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
+//    }
-    for i in [60, 101, 90, 97, 70, 80] {
+//    
-      let opDesc = program.programDesc.blocks[0].ops[i]
+//    for i in [60, 101, 90, 97, 70, 80] {
-      let output = opDesc.outputs["Out"]!.first!
+//      let opDesc = program.programDesc.blocks[0].ops[i]
-      let v = program.scope[output]!
+//      let output = opDesc.outputs["Out"]!.first!
-      let originTexture = v as! Texture<Float32>
+//      let v = program.scope[output]!
-      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
+//      let originTexture = v as! Texture
-      opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
+//      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
-    }
+//      opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
+//    }
-    for i in [102] {
+//    
-      let opDesc = program.programDesc.blocks[0].ops[i]
+//    for i in [102] {
-      for output in opDesc.outputs["Out"]! {
+//      let opDesc = program.programDesc.blocks[0].ops[i]
-        let v = program.scope[output]!
+//      for output in opDesc.outputs["Out"]! {
-        let originTexture = v as! Texture<Float32>
+//        let v = program.scope[output]!
-        originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
+//        let originTexture = v as! Texture
-      }
+//        originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
-      opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
+//      }
-      print(" split axis \(opDesc.attrs["axis"])")
+//      opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
-    }
+//      print(" split axis \(opDesc.attrs["axis"])")
+//    }
    // 99
-  }
+//  }
 }
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PaddleMobile.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PaddleMobile.swift
-//
-//  PaddleMobile.swift
-//  paddle-mobile-demo
-//
-//  Created by liuRuiLong on 2018/9/5.
-//  Copyright © 2018年 orange. All rights reserved.
-//
-import Foundation
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal
@@ -115,23 +115,3 @@ kernel void mobilent_ar_preprocess_half(texture2d<half, access::read> inTexture
  const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
  outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
 }
-kernel void scale(texture2d<float, access::sample> inTexture [[texture(0)]], texture2d<float, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) return;
-  float w_stride = inTexture.get_width() / outTexture.get_width();
-  float h_stride = inTexture.get_height() / outTexture.get_height();
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  float4 input = inTexture.sample(sample, float2(gid.x * w_stride,    gid.y * h_stride), 0);
-  outTexture.write(input, gid);
-}
-kernel void scale_half(texture2d<float, access::sample> inTexture [[texture(0)]], texture2d<half, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height()) return;
-  float w_stride = inTexture.get_width() / outTexture.get_width();
-  float h_stride = inTexture.get_height() / outTexture.get_height();
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  float4 input = inTexture.sample(sample, float2(gid.x * w_stride,    gid.y * h_stride), 0);
-  outTexture.write(half4(input), gid);
-}
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+import Metal
+import Foundation
+import paddle_mobile
+/// Demo wrapper for the YOLO model; input dimension is [1, 416, 416, 3].
+public class YoloNet: Net {
+  /// Resolves the model and parameter files from the main bundle and sets the input dimension.
+  @objc public override init(device: MTLDevice) {
+    super.init(device: device)
+    except = 0
+    let bundle = Bundle.main
+    modelPath = bundle.path(forResource: "yolo_model", ofType: nil) ?! "model null"
+    paramPath = bundle.path(forResource: "yolo_params", ofType: nil) ?! "para null"
+    inputDim = Dim(inDim: [1, 416, 416, 3])
+    // Disabled alternative: load the kernels from a custom metallib.
+//    metalLoadMode = .LoadMetalInCustomMetalLib
+//    metalLibPath = Bundle.main.path(forResource: "PaddleMobileMetal", ofType: "metallib") ?! " can't be nil "
+  }
+  /// Formats only the first element of the result for display.
+  override public func resultStr(res: ResultHolder) -> String {
+    let first = res.result[0]
+    return " \(first) ... "
+  }
+}
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.h
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#import <UIKit/UIKit.h>
+// UIViewController subclass for the Objective-C demo. The public interface
+// declares no properties or methods.
+@interface LoadPointerViewController : UIViewController
+@end
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/LoadPointerViewController.m
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/LoadPointerViewController.m
-//
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//  LoadPointerViewController.m
-//  paddle-mobile-demo
+ Licensed under the Apache License, Version 2.0 (the "License");
-//
+ you may not use this file except in compliance with the License.
-//  Created by Xiao,Haichun on 2018/9/19.
+ You may obtain a copy of the License at
-//  Copyright © 2018年 orange. All rights reserved.
-//
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#import "PaddleMobileGPU.h"
 #import "LoadPointerViewController.h"
-#import <Metal/Metal.h>
 #import "paddle-mobile-demo-Bridging-Header.h"
+#import <Metal/Metal.h>
 @interface LoadPointerViewController ()
 @property (strong, nonatomic) id<MTLDevice> device;

--- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/OCDemoViewController.h
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/OCDemoViewController.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#import <Foundation/Foundation.h>
+NS_ASSUME_NONNULL_BEGIN
+// Placeholder class for the Objective-C demo. Despite its name it derives from
+// NSObject rather than UIViewController, and it declares no members.
+@interface OCDemoViewController : NSObject
+@end
+NS_ASSUME_NONNULL_END
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/OCDemoViewController.m
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/OCDemoViewController.m
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#import "OCDemoViewController.h"
+// Empty implementation: no methods are defined for OCDemoViewController here.
+@implementation OCDemoViewController
+@end
--- a/metal/paddle-mobile/paddle-mobile/PaddleMobileGPU.h
+++ b/metal/paddle-mobile/paddle-mobile/PaddleMobileGPU.h
@@ -16,9 +16,8 @@
 #import <Foundation/Foundation.h>
 typedef enum : NSUInteger {
+  // Network kinds a caller can request. After this change only the
+  // super-resolution and MobileNet-SSD entries remain.
-  MobileNetType,
+  SuperResolutionNetType,
-  MobileNetSSDType,
+  MobileNetSSDType
-  GenetType,
 } NetType;
 @interface PaddleMobileGPUResult: NSObject

--- a/metal/paddle-mobile/paddle-mobile/PaddleMobileGPU.m
+++ b/metal/paddle-mobile/paddle-mobile/PaddleMobileGPU.m
@@ -12,10 +12,10 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
-#import <Foundation/Foundation.h>
 #import "PaddleMobileGPU.h"
-#import "paddle_mobile.h"
-#import <paddle_mobile/paddle_mobile-Swift.h>
+#import <Foundation/Foundation.h>
+#import <paddle_mobile_demo-Swift.h>
 @implementation ModelConfig
 @end
@@ -52,14 +52,12 @@
  self = [super init];
  if (self) {
    Net *net = nil;
-    if (netType == GenetType) {
+    if (netType == SuperResolutionNetType) {
-      net = [[Genet alloc] initWithDevice:queue.device paramPointer:config.paramPointer paramSize:config.paramSize modePointer:config.modelPointer modelSize:config.modelSize];
+      net = [[SuperResolutionNet alloc] initWithDevice:queue.device];
    } else if (netType == MobileNetSSDType) {
      net = [[MobileNet_ssd_AR alloc] initWithDevice:queue.device paramPointer:config.paramPointer paramSize:config.paramSize modePointer:config.modelPointer modelSize:config.modelSize];
-    } else if (netType == MobileNetType) {
    }
-    runner = [[Runner alloc] initInNet:net commandQueue:queue inPlatform:PlatformGPU];
+    runner = [[Runner alloc] initInNet:net commandQueue:queue];
  }
  return self;
 }
@@ -69,6 +67,7 @@
 }
 -(void)predict:(id<MTLTexture>)texture withCompletion:(void (^)(BOOL, NSArray<NSNumber *> *))completion {
  [runner predictWithTexture:texture completion:^(BOOL success, ResultHolder * _Nullable result) {
    NSMutableArray<NSNumber *> *resultArray = [NSMutableArray arrayWithCapacity:result.capacity];
    for (int i = 0; i < result.capacity; ++i) {

--- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+import Foundation
+import paddle_mobile
+/// Demo wrapper for the super-resolution model, exposed to Objective-C.
+/// Input dimension is [1, 224, 224, 1] and no preprocess kernel is used.
+@objc public class SuperResolutionNet: Net {
+  /// Returns a fixed placeholder string (the literal means "not implemented").
+  override public func resultStr(res: ResultHolder) -> String {
+    return "未实现"
+  }
+  /// Resolves the model and parameter files from the main bundle, clears the
+  /// preprocess kernel and sets the input dimension.
+  @objc override public init(device: MTLDevice) {
+    super.init(device: device)
+    except = 0
+    let bundle = Bundle.main
+    modelPath = bundle.path(forResource: "super_model", ofType: nil) ?! "model null"
+    paramPath = bundle.path(forResource: "super_params", ofType: nil) ?! "para null"
+    preprocessKernel = nil
+    inputDim = Dim(inDim: [1, 224, 224, 1])
+    // Disabled alternative: load the kernels from a custom metallib.
+//    metalLoadMode = .LoadMetalInCustomMetalLib
+//    metalLibPath = Bundle.main.path(forResource: "PaddleMobileMetal", ofType: "metallib") ?! " can't be nil "
+  }
+  /// Re-dimensions every non-persistable LodTensor texture in the program so
+  /// that entries 1 and 2 of `inputDim` replace its spatial extents, then
+  /// resizes the fetch output buffer to the same dimension.
+  override public func updateProgram(program: Program) {
+    // n h w c
+    for block in program.programDesc.blocks {
+      for varDesc in block.vars {
+        // Only non-persistable LodTensor variables backed by a Texture are touched.
+        guard !varDesc.persistable, varDesc.type == .LodTensor else { continue }
+        guard let texture = program.scope.vars[varDesc.name] as? Texture else { continue }
+        let newDim = Dim(inDim: [texture.dim[0], inputDim[1], inputDim[2], texture.tensorDim[1]])
+        print(" var desc name " + varDesc.name + " new dim" + "\(newDim)")
+        let newTensorDim = Dim(inDim: [texture.tensorDim[0], texture.tensorDim[1], inputDim[1], inputDim[2]])
+        texture.updateDims(inTensorDim: newTensorDim, inDim: newDim)
+        texture.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: GlobalConfig.shared.computePrecision)
+        // The fetch holder is resized on every matching variable, as in the original flow.
+        let output = program.scope.output() as! FetchHolder
+        output.dim = newDim
+        output.capacity = newDim.numel()
+        output.paddedCapacity = newDim.numel() * 4
+        output.initBuffer(device: device)
+      }
+    }
+  }
+}
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
@@ -18,31 +18,56 @@ import CoreMedia
 import paddle_mobile
 import MetalPerformanceShaders
-var platform: Platform = .GPU
+/// Reads a whole file into memory through the C stdio API.
+/// The file is opened in `init` and closed in `deinit`.
+class FileReader {
-let threadSupport: [(Platform, String)] = [(.GPU, "GPU"), (.CPU, "CPU")]
+  /// Open stdio handle for the file.
+  let file: UnsafeMutablePointer<FILE>
+  /// File size in bytes, as reported by `ftell` after seeking to the end.
+  let fileSize: Int
+  /// Opens `paramPath` for binary reading and records its size.
+  /// - Throws: `PaddleMobileError.loaderError` when the file cannot be opened
+  ///   or its reported size is not positive.
+  init(paramPath: String) throws {
+    guard let tmpFile = fopen(paramPath, "rb") else {
+      throw PaddleMobileError.loaderError(message: "open param file error" + paramPath)
+    }
+    file = tmpFile
+    fseek(file, 0, SEEK_END)
+    fileSize = ftell(file)
+    guard fileSize > 0 else {
+      throw PaddleMobileError.loaderError(message: "param file size is too small")
+    }
+    rewind(file)
+  }
+  /// Reads the entire file into a newly allocated buffer typed as `T`.
+  /// - Returns: a pointer to at least `fileSize` bytes. Nothing in this class
+  ///   frees it, so the caller is responsible for deallocating it.
+  func read<T>() -> UnsafeMutablePointer<T> {
+    // `allocate(capacity:)` counts elements of T, not bytes. Round the byte
+    // size up to whole elements instead of multiplying it by the element size.
+    let stride = MemoryLayout<T>.stride
+    let elementCount = (fileSize + stride - 1) / stride
+    let ptr = UnsafeMutablePointer<T>.allocate(capacity: elementCount)
+    // Start from the beginning so a repeated call reads the same data.
+    rewind(file)
+    fread(ptr, fileSize, 1, file)
+    return ptr
+  }
+  deinit {
+    fclose(file)
+  }
+}
-//.mobilenet_ssd : Runner.init(inNet: MobileNet_ssd_hand.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform),
+// Compute platforms known to the demo; GPU is the only case.
+enum Platform {
-let modelHelperMap: [SupportModel : Runner] = [
+  case GPU
-                                               .yolo : Runner.init(inNet: YoloNet.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform),
+}
-                                               .mobilenet_combined : Runner.init(inNet: MobileNetCombined.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform)]
-//, .genet : Genet.init()
-//let modelHelperMap: [SupportModel : Net] = [.mobilenet : MobileNet.init(), .mobilenet_ssd : MobileNet_ssd_hand.init()]
-let netSupport: [SupportModel : Net] = [.yolo : YoloNet.init(device: MetalHelper.shared.device), .mobilenet_combined : MobileNetCombined.init(device: MetalHelper.shared.device)]
+// Pairs each supported Platform with a label string.
+let platformSupport: [(Platform, String)] = [(.GPU, "GPU")]
 enum SupportModel: String{
+  // Models the demo can load; each case carries a string raw value.
+  // NOTE(review): "superresoltion" looks like a misspelling of
+  // "superresolution" - confirm how the raw value is used before changing it.
-  //  case mobilenet = "mobilenet"
+  case yolo               = "yolo"
-//  case mobilenet_ssd    = "mobilenetssd"
-  case yolo            = "yolo"
  case mobilenet_combined = "mobilenet_combined"
+  case super_resolution   = "superresoltion"
+  case mobilenet          = "mobilenet"
+  // Returns the supported models in a fixed order; ViewController uses the
+  // first entry as its initial modelType.
  static func supportedModels() -> [SupportModel] {
-    // .mobilenet,
+    return [.super_resolution, .yolo, .mobilenet_combined, .mobilenet]
-    // .mobilenet_ssd,
-    return [.yolo, .mobilenet_combined]
  }
 }
+// Maps each SupportModel case to a Net instance created on the shared Metal
+// device; ViewController.loadAct looks the selected model up here.
+let netSupport: [SupportModel : Net] = [
+  .super_resolution : SuperResolutionNet.init(device: MetalHelper.shared.device),
+  .yolo : YoloNet.init(device: MetalHelper.shared.device),
+  .mobilenet_combined : MobileNetCombined.init(device: MetalHelper.shared.device),
+  .mobilenet : MobileNet.init(device: MetalHelper.shared.device)]
 class ViewController: UIViewController {
  @IBOutlet weak var resultTextView: UITextView!
  @IBOutlet weak var selectImageView: UIImageView!
@@ -50,28 +75,37 @@ class ViewController: UIViewController {
  @IBOutlet weak var modelPickerView: UIPickerView!
  @IBOutlet weak var threadPickerView: UIPickerView!
  @IBOutlet weak var videoView: UIView!
-//  var videoCapture: VideoCapture!
+  //  var videoCapture: VideoCapture!
  var selectImage: UIImage?
  var inputPointer: UnsafeMutablePointer<Float32>?
  var modelType: SupportModel = SupportModel.supportedModels()[0]
  var toPredictTexture: MTLTexture?
  var runner: Runner!
+  var platform: Platform = .GPU
  var threadNum = 1
  @IBAction func loadAct(_ sender: Any) {
-     runner = Runner.init(inNet: netSupport[modelType]!, commandQueue: MetalHelper.shared.queue, inPlatform: platform)
+    runner = Runner.init(inNet: netSupport[modelType]!, commandQueue: MetalHelper.shared.queue)
+    if platform == .GPU {
-    if platform == .CPU {
+//      let filePath = Bundle.main.path(forResource: "mingren_input_data", ofType: nil)
-      if inputPointer == nil {
+//      let fileReader = try! FileReader.init(paramPath: filePath!)
-        inputPointer = runner.preproccess(image: selectImage!.cgImage!)
+//      let pointer: UnsafeMutablePointer<Float32> = fileReader.read()
+//      
-      }
+//      
-    } else if platform == .GPU {
+//      let buffer = MetalHelper.shared.device.makeBuffer(length: fileReader.fileSize, options: .storageModeShared)
+//      
+//      buffer?.contents().copyMemory(from: pointer, byteCount: fileReader.fileSize)
      if self.toPredictTexture == nil {
-        runner.getTexture(image: selectImage!.cgImage!) {[weak self] (texture) in
+//        runner.getTexture(inBuffer: buffer!) { [weak self] (texture) in
+//          self?.toPredictTexture = texture
+//        }
+        runner.getTexture(image: selectImage!.cgImage!) { [weak self] (texture) in
          self?.toPredictTexture = texture
        }
      }
@@ -106,27 +140,21 @@ class ViewController: UIViewController {
        return
      }
-//      for _ in 0..<1{
-//        runner.predict(texture: inTexture) { (success, resultHolder)  in
-//          resultHolder?.releasePointer()
-//        }
-//      }
      let startDate = Date.init()
      for i in 0..<max {
-        runner.predict(texture: inTexture) { [weak self] (success, resultHolder)  in
+        self.runner.predict(texture: inTexture) { [weak self] (success, resultHolder)  in
          guard let sSelf = self else {
            fatalError()
          }
-          if success {
+          if success, let inResultHolder = resultHolder {
            if i == max - 1 {
              let time = Date.init().timeIntervalSince(startDate)
+              print(inResultHolder.result.floatArr(count: inResultHolder.capacity).strideArray())
              DispatchQueue.main.async {
-//                print(resultHolder!.result![0])
                sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: resultHolder!)
                sSelf.elapsedTimeLabel.text = "平均耗时: \(time/Double(max) * 1000.0) ms"
              }
            }
          }
@@ -134,39 +162,6 @@ class ViewController: UIViewController {
          DispatchQueue.main.async {
            resultHolder?.releasePointer()
          }
-//            print("释放")
-        }
-//        print("sleep before ")
-//        usleep(33000)
-//        print("sleep after ")
-      }
-    case .CPU:
-      guard let inInputPointer = inputPointer else {
-        fatalError( " need input pointer " )
-      }
-      for _ in 0..<10 {
-        runner.predict(inputPointer: inInputPointer) { (success, res) in
-          res?.releaseOutput()
-        }
-      }
-      let startDate = Date.init()
-      for i in 0..<max {
-        runner.predict(inputPointer: inInputPointer) { [weak self](success, res) in
-          guard let sSelf = self else {
-            fatalError()
-          }
-          if success {
-            if i == max - 1 {
-              let time = Date.init().timeIntervalSince(startDate)
-              DispatchQueue.main.async {
-//                sSelf.resultTextView.text = sSelf.runner.net.resultStr(res: res)
-                sSelf.elapsedTimeLabel.text = "平均耗时: \(time/Double(max) * 1000.0) ms"
-              }
-            }
-          }
-          res?.releaseOutput()
        }
      }
    }
@@ -179,37 +174,38 @@ class ViewController: UIViewController {
    modelPickerView.dataSource = self
    threadPickerView.delegate = self
    threadPickerView.dataSource = self
-    if let image = UIImage.init(named: "test.jpg") {
+    if let image = UIImage.init(named: "classify-img-output.png") {
-        selectImage = image
+      selectImage = image
-        selectImageView.image = image
+      selectImageView.image = image
    } else {
-        print("请添加测试图片")
+      print("请添加测试图片")
    }
+    GlobalConfig.shared.computePrecision = .Float32
-//    if platform == .CPU {
+    //    if platform == .CPU {
-//      inputPointer = runner.preproccess(image: selectImage!.cgImage!)
+    //      inputPointer = runner.preproccess(image: selectImage!.cgImage!)
-//    } else if platform == .GPU {
+    //    } else if platform == .GPU {
-//      runner.getTexture(image: selectImage!.cgImage!) {[weak self] (texture) in
+    //      runner.getTexture(image: selectImage!.cgImage!) {[weak self] (texture) in
-//        self?.toPredictTexture = texture
+    //        self?.toPredictTexture = texture
-//      }
+    //      }
-//    } else {
+    //    } else {
-//      fatalError( " unsupport " )
+    //      fatalError( " unsupport " )
-//    }
+    //    }
+    //    videoCapture = VideoCapture.init(device: MetalHelper.shared.device, orientation: .portrait, position: .back)
+    //    videoCapture.fps = 30
+    //    videoCapture.delegate = self
+    //    videoCapture.setUp { (success) in
+    //      DispatchQueue.main.async {
+    //        if let preViewLayer = self.videoCapture.previewLayer {
+    //          self.videoView.layer.addSublayer(preViewLayer)
+    //          self.videoCapture.previewLayer?.frame = self.videoView.bounds
+    //        }
+    //        self.videoCapture.start()
+    //      }
+    //    }
-//    videoCapture = VideoCapture.init(device: MetalHelper.shared.device, orientation: .portrait, position: .back)
-//    videoCapture.fps = 30
-//    videoCapture.delegate = self
-//    videoCapture.setUp { (success) in
-//      DispatchQueue.main.async {
-//        if let preViewLayer = self.videoCapture.previewLayer {
-//          self.videoView.layer.addSublayer(preViewLayer)
-//          self.videoCapture.previewLayer?.frame = self.videoView.bounds
-//        }
-//        self.videoCapture.start()
-//      }
-//    }
  }
 }
@@ -228,7 +224,7 @@ extension ViewController: UIPickerViewDataSource, UIPickerViewDelegate{
    if pickerView == modelPickerView {
      return SupportModel.supportedModels().count
    } else if pickerView == threadPickerView {
-      return threadSupport.count
+      return platformSupport.count
    } else {
      fatalError()
    }
@@ -238,7 +234,7 @@ extension ViewController: UIPickerViewDataSource, UIPickerViewDelegate{
    if pickerView == modelPickerView {
      return SupportModel.supportedModels()[row].rawValue
    } else if pickerView == threadPickerView {
-      return threadSupport[row].1
+      return platformSupport[row].1
    } else {
      fatalError()
    }
@@ -248,8 +244,7 @@ extension ViewController: UIPickerViewDataSource, UIPickerViewDelegate{
    if pickerView == modelPickerView {
      self.modelType = SupportModel.supportedModels()[row]
    } else if pickerView == threadPickerView {
+      platform = platformSupport[row].0
-      platform = threadSupport[row].0
    } else {
      fatalError()
    }
@@ -276,25 +271,11 @@ extension ViewController: VideoCaptureDelegate{
  func predictTexture(texture: MTLTexture){
    runner.scaleTexture(input: texture) { (scaledTexture) in
      self.runner.predict(texture: scaledTexture, completion: { (success, resultHolder) in
-//        print(resultHolder!.result![0])
+        //        print(resultHolder!.result![0])
        resultHolder?.releasePointer()
      })
    }
  }
-//  @available(iOS 10.0, *)
-//  func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime) {
-////    if !bool1 {
-////      DispatchQueue.main.asyncAfter(deadline: DispatchTime.init(uptimeNanoseconds: 500000000)) {
-//    self.predictTexture(texture: texture!)
-////      }
-//
-//
-////      bool1 = true
-////    }
-//
-//  }
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormKernel.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormRelu.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormRelu.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.inc.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.inc.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.inc.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPrelu.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPrelu.inc.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPreluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPreluKernel.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvBNReluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvBNReluKernel.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.inc.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.metal
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/metal/FetchKernel.inc.metal
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/metal/FetchKernel.inc.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#ifdef P
+#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
+#define CONCAT2_(a, b) a ## _ ## b
+#define CONCAT2(a, b) a ## b
+#define FUNC(m, n, q) CONCAT3_(m, n, q)
+#define FUNC_T(m, n) CONCAT2_(m, n)
+#define VECTOR(p, n) CONCAT2(p, n)
+// Fetch kernel template, expanded once per element type P by the including file.
+// Each thread reads one texel of inTexture (array slice gid.z, position gid.xy) and
+// stores its four components into `output` as floats. For every array slice the
+// buffer holds four consecutive planes of width * height values (x, y, z, w), each
+// plane laid out row by row.
+kernel void FUNC_T(fetch, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
+                  device float *output [[buffer(0)]],
+                  uint3 gid [[thread_position_in_grid]]) {
+  // Threads dispatched outside the texture extent have nothing to copy.
+  const bool outside = gid.x >= inTexture.get_width() ||
+                       gid.y >= inTexture.get_height() ||
+                       gid.z >= inTexture.get_array_size();
+  if (outside) {
+    return;
+  }
+  int width = inTexture.get_width();
+  int height = inTexture.get_height();
+  const VECTOR(P, 4) texel = inTexture.read(gid.xy, gid.z);
+  // Size of one component plane, and of the four planes belonging to one slice.
+  int plane = width * height;
+  int slice_stride = 4 * width * height;
+  // Offset of this texel inside the first (x) plane of its slice.
+  const uint base = gid.z * slice_stride + gid.y * width + gid.x;
+  output[base + 0 * plane] = texel.x;
+  output[base + 1 * plane] = texel.y;
+  output[base + 2 * plane] = texel.z;
+  output[base + 3 * plane] = texel.w;
+}
+#endif
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/metal/FetchKernel.metal
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/metal/FetchKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+using namespace metal;
+#define P float  // first pass: expands the template in FetchKernel.inc.metal into fetch_float
+#include "FetchKernel.inc.metal"
+#undef P
+#define P half  // second pass: same template, expanded into fetch_half
+#include "FetchKernel.inc.metal"
+#undef P
+kernel void fetch_placeholder(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                              device float *output [[buffer(0)]],
+                              uint3 gid [[thread_position_in_grid]]) {  // empty body: takes the same arguments as fetch_float but reads and writes nothing. NOTE(review): the caller that looks this kernel up by name is not visible here - confirm it is still needed
+}
+kernel void fetch_placeholder_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                                   device float *output [[buffer(0)]],
+                                   uint3 gid [[thread_position_in_grid]]) {  // empty body: half-texture counterpart of fetch_placeholder; reads and writes nothing. NOTE(review): confirm which caller requests this kernel by name
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Macro.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Macro.metal
@@ -24,6 +24,6 @@ using namespace metal;
 #define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p)
 #define VECTOR(p, n) CONCAT2(p, n)
+#define FUNC2_(a, b) CONCAT2_(a, b)
 #define FUNC3_(a, b, c) CONCAT3_(a, b, c)
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/NMSFetchResultKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/NMSFetchResultKernel.metal
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/metal/PoolKernel.inc.metal
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/metal/PoolKernel.inc.metal
+//
+//  PoolKernel.inc.metal
+//  paddle-mobile
+//
+//  Created by liuRuiLong on 2018/12/29.
+//  Copyright © 2018 orange. All rights reserved.
+//
+#ifdef P
+// Pooling kernel template, expanded once per element type P by the including file.
+// Each thread produces the texel of outTexture at gid.xy in array slice gid.z from a
+// rectangular window of the same slice of inTexture.
+//   pm.poolType == 0 : component-wise maximum over the window
+//   pm.poolType == 1 : component-wise mean over the part of the window that lies
+//                      inside the input (the divisor is the clipped window area)
+//   any other value  : the output texel is written as zero
+// The horizontal window uses the X kernel size/stride/padding and the vertical window
+// uses the Y ones, so non-square pooling configurations are honoured.
+kernel void FUNC2_(pool, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
+                 texture2d_array<P, access::write> outTexture [[texture(1)]],
+                 constant PoolParam &pm [[buffer(0)]],
+                 uint3 gid [[thread_position_in_grid]]) {
+  // Threads dispatched beyond the output extent have nothing to write.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+  // Horizontal window [xmin, xmax), clipped to the input width.
+  int xmin = gid.x * pm.strideX - pm.paddingX;
+  int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
+  xmin = max(xmin, 0);
+  // Vertical window [ymin, ymax), clipped to the input height. This previously reused
+  // strideX / paddingX / ksizeX, which is only correct for square configurations.
+  int ymin = gid.y * pm.strideY - pm.paddingY;
+  int ymax = min(ymin + pm.ksizeY, int(inTexture.get_height()));
+  ymin = max(ymin, 0);
+  VECTOR(P, 4) r = 0;
+  if (pm.poolType == 0) {
+    // Seed the maximum with the first texel of the window so negative inputs are kept.
+    r = inTexture.read(uint2(xmin, ymin), gid.z);
+    for (int x = xmin; x < xmax; x++) {
+      for (int y = ymin; y < ymax; y++) {
+        r = fmax(r, inTexture.read(uint2(x, y), gid.z));
+      }
+    }
+  } else if (pm.poolType == 1) {
+    for (int x = xmin; x < xmax; x++) {
+      for (int y = ymin; y < ymax; y++) {
+        r += inTexture.read(uint2(x, y), gid.z);
+      }
+    }
+    // Average over the clipped window only; padded positions are not counted.
+    r /= (xmax - xmin) * (ymax - ymin);
+  }
+  outTexture.write(r, gid.xy, gid.z);
+}
+#endif
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/metal/PoolKernel.metal
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/metal/PoolKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+#include "Macro.metal"
+using namespace metal;
+struct PoolParam {  // parameter block the pool kernels receive at buffer(0)
+  int ksizeX;  // pooling window extent along x
+  int ksizeY;  // presumably the window extent along y - confirm against the kernel
+  int strideX;  // step between consecutive windows along x
+  int strideY;  // presumably the step along y - confirm against the kernel
+  int paddingX;  // amount subtracted from the window start along x
+  int paddingY;  // presumably the same offset along y - confirm against the kernel
+  int poolType;  // 0 selects max pooling, 1 selects average pooling in the pool kernel
+};
+// Expand the pool kernel template once for float and once for half.
+// This has to be #include rather than #import: #import brings a given file in at most
+// once per translation unit, so the second pass (P = half) would be skipped and no
+// half-precision pool kernel would be generated.
+#define P float
+#include "PoolKernel.inc.metal"
+#undef P
+#define P half
+#include "PoolKernel.inc.metal"
+#undef P
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReluKernel.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.inc.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ResizeBilinear.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ResizeBilinear.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Shape.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Shape.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.inc.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.inc.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.inc.metal
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.metal
--- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
@@ -16,7 +16,6 @@
 		4AA1EA92214665D700D0F791 /* ShapeOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA91214665D700D0F791 /* ShapeOp.swift */; };
 		4AA1EA942146661500D0F791 /* ShapeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA932146661500D0F791 /* ShapeKernel.swift */; };
 		4AA1EA982146666500D0F791 /* FlattenOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA972146666500D0F791 /* FlattenOp.swift */; };
-		4AA1EA9E2148D6F900D0F791 /* ConcatKernel.inc.metal in Headers */ = {isa = PBXBuildFile; fileRef = 4AA1EA9D2148D6F900D0F791 /* ConcatKernel.inc.metal */; };
 		4AA1EAA02148DEEE00D0F791 /* ReshapeKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA9F2148DEEE00D0F791 /* ReshapeKernel.inc.metal */; };
 		4AA1EAA2214912CD00D0F791 /* FlattenKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA1214912CC00D0F791 /* FlattenKernel.swift */; };
 		4AA1EAA4214A295C00D0F791 /* Split.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA3214A295C00D0F791 /* Split.inc.metal */; };
@@ -29,8 +28,6 @@
 		4AF9287921341661005B6C3A /* Softmax.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF9287821341661005B6C3A /* Softmax.metal */; };
 		4AF928822135673D005B6C3A /* ConcatKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF928812135673D005B6C3A /* ConcatKernel.metal */; };
 		4AF9288421357BE3005B6C3A /* Elementwise.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF9288321357BE3005B6C3A /* Elementwise.metal */; };
-		C28FDF8421B7858F0054EFAC /* MobileNetCombined.swift in Sources */ = {isa = PBXBuildFile; fileRef = C28FDF8221B7858F0054EFAC /* MobileNetCombined.swift */; };
-		C28FDF8521B7858F0054EFAC /* YoloNet.swift in Sources */ = {isa = PBXBuildFile; fileRef = C28FDF8321B7858F0054EFAC /* YoloNet.swift */; };
 		C28FE02F21BA68C00054EFAC /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = C28FE02C21BA68C00054EFAC /* Metal.framework */; };
 		C28FE03021BA68C00054EFAC /* MetalPerformanceShaders.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = C28FE02D21BA68C00054EFAC /* MetalPerformanceShaders.framework */; };
 		C28FE03121BA68C00054EFAC /* MetalKit.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = C28FE02E21BA68C00054EFAC /* MetalKit.framework */; };
@@ -64,19 +61,15 @@
 		FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */; };
 		FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */; };
 		FC1B16B320EC9A4F00678B91 /* Kernels.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC1B16B220EC9A4F00678B91 /* Kernels.metal */; };
-		FC292C5421421B2F00CF622F /* PaddleMobileGPU.h in Headers */ = {isa = PBXBuildFile; fileRef = FC292C5321421B2E00CF622F /* PaddleMobileGPU.h */; settings = {ATTRIBUTES = (Public, ); }; };
+		FC1CF3F721D4B4C400F7392E /* Runner.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC1CF3F621D4B4C400F7392E /* Runner.swift */; };
-		FC292C5621421B4600CF622F /* PaddleMobileGPU.m in Sources */ = {isa = PBXBuildFile; fileRef = FC292C5521421B4600CF622F /* PaddleMobileGPU.m */; };
+		FC2BFCC221DF2F9100C262B2 /* GlobalConfig.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFCC121DF2F9100C262B2 /* GlobalConfig.swift */; };
-		FC292C81214255BD00CF622F /* CPUCompute.mm in Sources */ = {isa = PBXBuildFile; fileRef = FC292C7C214255BC00CF622F /* CPUCompute.mm */; };
+		FC2BFD4621DF685F00C262B2 /* Scale.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD4521DF685F00C262B2 /* Scale.swift */; };
-		FC292C82214255BD00CF622F /* MobileNetSSD.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC292C7E214255BC00CF622F /* MobileNetSSD.swift */; };
+		FC2BFD4A21DF81DE00C262B2 /* Kernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD4921DF81DE00C262B2 /* Kernel.swift */; };
-		FC292C85214257CB00CF622F /* CPUCompute.h in Headers */ = {isa = PBXBuildFile; fileRef = FC292C7D214255BC00CF622F /* CPUCompute.h */; settings = {ATTRIBUTES = (Public, ); }; };
+		FC2BFD4E21DF820B00C262B2 /* ConvAddBatchNormReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD4D21DF820A00C262B2 /* ConvAddBatchNormReluOp.swift */; };
-		FC292C872142624800CF622F /* Genet.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC292C862142624800CF622F /* Genet.swift */; };
+		FC2BFD5121DF8E0400C262B2 /* Scale.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD5021DF8E0400C262B2 /* Scale.metal */; };
-		FC33B0F02147659000714A93 /* MobileNet.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC33B0EF2147659000714A93 /* MobileNet.swift */; };
 		FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */; };
 		FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74820F0B954007C0C6D /* ConvKernel.metal */; };
 		FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */; };
-		FC4FD9752140E1DE0073E130 /* PaddleMobile.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC4FD9742140E1DE0073E130 /* PaddleMobile.swift */; };
-		FC4FD9792140E4980073E130 /* PaddleMobileCPU.h in Headers */ = {isa = PBXBuildFile; fileRef = FC4FD9772140E4980073E130 /* PaddleMobileCPU.h */; settings = {ATTRIBUTES = (Public, ); }; };
-		FC4FD97A2140E4980073E130 /* libpaddle-mobile.a in Frameworks */ = {isa = PBXBuildFile; fileRef = FC4FD9782140E4980073E130 /* libpaddle-mobile.a */; };
 		FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC5163F520EF556E00636C28 /* Texture2DTo2DArrayKernel.swift */; };
 		FC60DB8920E9AAA500FF203F /* MetalExtension.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC60DB8820E9AAA500FF203F /* MetalExtension.swift */; };
 		FC803BBF214CB65A0094B8E5 /* ConvAddPreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */; };
@@ -86,7 +79,9 @@
 		FC803BC7214CBA820094B8E5 /* Macro.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC6214CBA820094B8E5 /* Macro.metal */; };
 		FC803BC9214CFC8D0094B8E5 /* FetchKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */; };
 		FC82735920E3C04200BE430A /* OpCreator.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC82735820E3C04200BE430A /* OpCreator.swift */; };
-		FC9A19E32148C31300CD9CBF /* MobilenetSSD_AR.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9A19E22148C31300CD9CBF /* MobilenetSSD_AR.swift */; };
+		FC9797C921D6101D00F2FD90 /* ResizeBilinearOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9797C821D6101D00F2FD90 /* ResizeBilinearOp.swift */; };
+		FC9797CB21D6102D00F2FD90 /* ResizeBilinearKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9797CA21D6102D00F2FD90 /* ResizeBilinearKernel.swift */; };
+		FC9C2A0D21D3D185005856C6 /* FetchKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC9C2A0C21D3D185005856C6 /* FetchKernel.inc.metal */; };
 		FC9D037920E229E4000F735A /* OpParam.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D037820E229E4000F735A /* OpParam.swift */; };
 		FC9D038020E22FBB000F735A /* FeedOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D037F20E22FBB000F735A /* FeedOp.swift */; };
 		FC9D038220E2312E000F735A /* FetchOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D038120E2312E000F735A /* FetchOp.swift */; };
@@ -97,6 +92,7 @@
 		FCA67CD52138272900BD58AA /* ConvAddMetal.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67CD42138272900BD58AA /* ConvAddMetal.metal */; };
 		FCA67CD7213827AC00BD58AA /* ConvAddBNReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67CD6213827AC00BD58AA /* ConvAddBNReluKernel.metal */; };
 		FCA67CD92138287B00BD58AA /* ConvBNReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67CD82138287B00BD58AA /* ConvBNReluKernel.metal */; };
+		FCB40E5921E0DCAB0075EC91 /* FetchKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCB40E5821E0DCAB0075EC91 /* FetchKernel.swift */; };
 		FCBCCC572122F41300D94F7E /* DwConvBNReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */; };
 		FCBCCC592122F42700D94F7E /* ConvBNReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */; };
 		FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */; };
@@ -111,6 +107,7 @@
 		FCBCCC6D2123073A00D94F7E /* BoxcoderKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC6C2123073A00D94F7E /* BoxcoderKernel.swift */; };
 		FCBCCC6F2123097100D94F7E /* MulticlassNMSOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC6E2123097100D94F7E /* MulticlassNMSOp.swift */; };
 		FCBCCC71212309A700D94F7E /* MulticlassNMSKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */; };
+		FCCED5E121D71FC000BE8D5F /* PoolKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCCED5E021D71FC000BE8D5F /* PoolKernel.inc.metal */; };
 		FCD04E6620F314C50007374F /* PoolOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6520F314C50007374F /* PoolOp.swift */; };
 		FCD04E6820F315020007374F /* PoolKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6720F315020007374F /* PoolKernel.swift */; };
 		FCD04E6A20F319EC0007374F /* SoftmaxOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6920F319EC0007374F /* SoftmaxOp.swift */; };
@@ -136,9 +133,7 @@
 		FCE9D7B9214FAA4800B520C3 /* NMSFetchResultKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */; };
 		FCEB684A212F00DB00D2448E /* PreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCEB6849212F00DB00D2448E /* PreluKernel.metal */; };
 		FCEB684C212F093800D2448E /* PreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEB684B212F093800D2448E /* PreluOp.swift */; };
-		FCEBC0F420F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEBC0F320F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift */; };
 		FCEBC0F620F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEBC0F520F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift */; };
-		FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCF2D73720E64E70007AC5F5 /* Kernel.swift */; };
 /* End PBXBuildFile section */
 /* Begin PBXFileReference section */
@@ -164,8 +159,6 @@
 		4AF9287821341661005B6C3A /* Softmax.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Softmax.metal; sourceTree = "<group>"; };
 		4AF928812135673D005B6C3A /* ConcatKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConcatKernel.metal; sourceTree = "<group>"; };
 		4AF9288321357BE3005B6C3A /* Elementwise.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Elementwise.metal; sourceTree = "<group>"; };
-		C28FDF8221B7858F0054EFAC /* MobileNetCombined.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MobileNetCombined.swift; sourceTree = "<group>"; };
-		C28FDF8321B7858F0054EFAC /* YoloNet.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = YoloNet.swift; sourceTree = "<group>"; };
 		C28FE02C21BA68C00054EFAC /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
 		C28FE02D21BA68C00054EFAC /* MetalPerformanceShaders.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalPerformanceShaders.framework; path = System/Library/Frameworks/MetalPerformanceShaders.framework; sourceTree = SDKROOT; };
 		C28FE02E21BA68C00054EFAC /* MetalKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalKit.framework; path = System/Library/Frameworks/MetalKit.framework; sourceTree = SDKROOT; };
@@ -203,19 +196,15 @@
 		FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BatchNormKernel.swift; sourceTree = "<group>"; };
 		FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddKernel.swift; sourceTree = "<group>"; };
 		FC1B16B220EC9A4F00678B91 /* Kernels.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Kernels.metal; sourceTree = "<group>"; };
-		FC292C5321421B2E00CF622F /* PaddleMobileGPU.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PaddleMobileGPU.h; sourceTree = "<group>"; };
+		FC1CF3F621D4B4C400F7392E /* Runner.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Runner.swift; sourceTree = "<group>"; };
-		FC292C5521421B4600CF622F /* PaddleMobileGPU.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = PaddleMobileGPU.m; sourceTree = "<group>"; };
+		FC2BFCC121DF2F9100C262B2 /* GlobalConfig.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = GlobalConfig.swift; sourceTree = "<group>"; };
-		FC292C7C214255BC00CF622F /* CPUCompute.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = CPUCompute.mm; sourceTree = "<group>"; };
+		FC2BFD4521DF685F00C262B2 /* Scale.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Scale.swift; sourceTree = "<group>"; };
-		FC292C7D214255BC00CF622F /* CPUCompute.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = CPUCompute.h; sourceTree = "<group>"; };
+		FC2BFD4921DF81DE00C262B2 /* Kernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Kernel.swift; sourceTree = "<group>"; };
-		FC292C7E214255BC00CF622F /* MobileNetSSD.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MobileNetSSD.swift; sourceTree = "<group>"; };
+		FC2BFD4D21DF820A00C262B2 /* ConvAddBatchNormReluOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ConvAddBatchNormReluOp.swift; sourceTree = "<group>"; };
-		FC292C862142624800CF622F /* Genet.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Genet.swift; sourceTree = "<group>"; };
+		FC2BFD5021DF8E0400C262B2 /* Scale.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Scale.metal; sourceTree = "<group>"; };
-		FC33B0EF2147659000714A93 /* MobileNet.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MobileNet.swift; sourceTree = "<group>"; };
 		FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PaddleMobileUnitTest.swift; sourceTree = "<group>"; };
 		FC4CB74820F0B954007C0C6D /* ConvKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvKernel.metal; sourceTree = "<group>"; };
 		FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ProgramOptimize.swift; sourceTree = "<group>"; };
-		FC4FD9742140E1DE0073E130 /* PaddleMobile.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = PaddleMobile.swift; sourceTree = "<group>"; };
-		FC4FD9772140E4980073E130 /* PaddleMobileCPU.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PaddleMobileCPU.h; sourceTree = "<group>"; };
-		FC4FD9782140E4980073E130 /* libpaddle-mobile.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; path = "libpaddle-mobile.a"; sourceTree = "<group>"; };
 		FC4FD97D2140F2C30073E130 /* libstdc++.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = "libstdc++.tbd"; path = "usr/lib/libstdc++.tbd"; sourceTree = SDKROOT; };
 		FC5163F520EF556E00636C28 /* Texture2DTo2DArrayKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Texture2DTo2DArrayKernel.swift; sourceTree = "<group>"; };
 		FC60DB8820E9AAA500FF203F /* MetalExtension.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MetalExtension.swift; sourceTree = "<group>"; };
@@ -226,7 +215,9 @@
 		FC803BC6214CBA820094B8E5 /* Macro.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Macro.metal; sourceTree = "<group>"; };
 		FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = FetchKernel.metal; sourceTree = "<group>"; };
 		FC82735820E3C04200BE430A /* OpCreator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpCreator.swift; sourceTree = "<group>"; };
-		FC9A19E22148C31300CD9CBF /* MobilenetSSD_AR.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MobilenetSSD_AR.swift; sourceTree = "<group>"; };
+		FC9797C821D6101D00F2FD90 /* ResizeBilinearOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ResizeBilinearOp.swift; sourceTree = "<group>"; };
+		FC9797CA21D6102D00F2FD90 /* ResizeBilinearKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ResizeBilinearKernel.swift; sourceTree = "<group>"; };
+		FC9C2A0C21D3D185005856C6 /* FetchKernel.inc.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = FetchKernel.inc.metal; sourceTree = "<group>"; };
 		FC9D037820E229E4000F735A /* OpParam.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpParam.swift; sourceTree = "<group>"; };
 		FC9D037F20E22FBB000F735A /* FeedOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FeedOp.swift; sourceTree = "<group>"; };
 		FC9D038120E2312E000F735A /* FetchOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FetchOp.swift; sourceTree = "<group>"; };
@@ -237,6 +228,7 @@
 		FCA67CD42138272900BD58AA /* ConvAddMetal.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddMetal.metal; sourceTree = "<group>"; };
 		FCA67CD6213827AC00BD58AA /* ConvAddBNReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddBNReluKernel.metal; sourceTree = "<group>"; };
 		FCA67CD82138287B00BD58AA /* ConvBNReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvBNReluKernel.metal; sourceTree = "<group>"; };
+		FCB40E5821E0DCAB0075EC91 /* FetchKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FetchKernel.swift; sourceTree = "<group>"; };
 		FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DwConvBNReluOp.swift; sourceTree = "<group>"; };
 		FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvBNReluOp.swift; sourceTree = "<group>"; };
 		FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvBNReluKernel.swift; sourceTree = "<group>"; };
@@ -251,6 +243,7 @@
 		FCBCCC6C2123073A00D94F7E /* BoxcoderKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BoxcoderKernel.swift; sourceTree = "<group>"; };
 		FCBCCC6E2123097100D94F7E /* MulticlassNMSOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MulticlassNMSOp.swift; sourceTree = "<group>"; };
 		FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MulticlassNMSKernel.swift; sourceTree = "<group>"; };
+		FCCED5E021D71FC000BE8D5F /* PoolKernel.inc.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PoolKernel.inc.metal; sourceTree = "<group>"; };
 		FCD04E6520F314C50007374F /* PoolOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PoolOp.swift; sourceTree = "<group>"; };
 		FCD04E6720F315020007374F /* PoolKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PoolKernel.swift; sourceTree = "<group>"; };
 		FCD04E6920F319EC0007374F /* SoftmaxOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SoftmaxOp.swift; sourceTree = "<group>"; };
@@ -276,9 +269,7 @@
 		FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = NMSFetchResultKernel.metal; sourceTree = "<group>"; };
 		FCEB6849212F00DB00D2448E /* PreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PreluKernel.metal; sourceTree = "<group>"; };
 		FCEB684B212F093800D2448E /* PreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PreluOp.swift; sourceTree = "<group>"; };
-		FCEBC0F320F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; name = ConvAddBatchNormReluOp.swift; path = "paddle-mobile/Operators/ConvAddBatchNormReluOp.swift"; sourceTree = SOURCE_ROOT; };
 		FCEBC0F520F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddBatchNormReluKernel.swift; sourceTree = "<group>"; };
-		FCF2D73720E64E70007AC5F5 /* Kernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; name = Kernel.swift; path = "paddle-mobile/Operators/Kernels/Base/Kernel.swift"; sourceTree = SOURCE_ROOT; };
 /* End PBXFileReference section */
 /* Begin PBXFrameworksBuildPhase section */
@@ -290,7 +281,6 @@
 				C28FE03021BA68C00054EFAC /* MetalPerformanceShaders.framework in Frameworks */,
 				C28FE03121BA68C00054EFAC /* MetalKit.framework in Frameworks */,
 				D3831F70E7E0B565B9AC22DA /* Pods_paddle_mobile.framework in Frameworks */,
-				FC4FD97A2140E4980073E130 /* libpaddle-mobile.a in Frameworks */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
@@ -339,23 +329,8 @@
 		FC039B6C20E11C3C0081E9F8 /* paddle-mobile */ = {
 			isa = PBXGroup;
 			children = (
-				C28FDF8221B7858F0054EFAC /* MobileNetCombined.swift */,
+				FC2BFD4721DF818000C262B2 /* API */,
-				C28FDF8321B7858F0054EFAC /* YoloNet.swift */,
+				FC2BFD4821DF818000C262B2 /* Src */,
-				FCE9D7B6214F869000B520C3 /* Net.swift */,
-				FC9A19E22148C31300CD9CBF /* MobilenetSSD_AR.swift */,
-				FC33B0EF2147659000714A93 /* MobileNet.swift */,
-				FC292C862142624800CF622F /* Genet.swift */,
-				FC292C7E214255BC00CF622F /* MobileNetSSD.swift */,
-				FC292C7C214255BC00CF622F /* CPUCompute.mm */,
-				FC292C7D214255BC00CF622F /* CPUCompute.h */,
-				FC292C5521421B4600CF622F /* PaddleMobileGPU.m */,
-				FC292C5321421B2E00CF622F /* PaddleMobileGPU.h */,
-				FC4FD9762140E4920073E130 /* CPU */,
-				FC4FD9742140E1DE0073E130 /* PaddleMobile.swift */,
-				FC039BAE20E11CC20081E9F8 /* Program */,
-				FC039BA320E11CBC0081E9F8 /* Operators */,
-				FC039B9C20E11CB20081E9F8 /* framework */,
-				FC039B9320E11C9A0081E9F8 /* Common */,
 				FC039B6D20E11C3C0081E9F8 /* paddle_mobile.h */,
 				FC039B6E20E11C3C0081E9F8 /* Info.plist */,
 			);
@@ -375,7 +350,7 @@
 			path = Common;
 			sourceTree = "<group>";
 		};
-		FC039B9C20E11CB20081E9F8 /* framework */ = {
+		FC039B9C20E11CB20081E9F8 /* Framework */ = {
 			isa = PBXGroup;
 			children = (
 				FC039BA120E11CB70081E9F8 /* Loader.swift */,
@@ -384,7 +359,7 @@
 				FC039B9E20E11CB20081E9F8 /* Dim.swift */,
 				FC9D038320E23B01000F735A /* Texture.swift */,
 			);
-			path = framework;
+			path = Framework;
 			sourceTree = "<group>";
 		};
 		FC039BA320E11CBC0081E9F8 /* Operators */ = {
@@ -392,7 +367,8 @@
 			children = (
 				FC086BA520E67E8500D85EF7 /* Kernels */,
 				FCD592FA20E248EC00252966 /* Base */,
-				FCEBC0F320F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift */,
+				FC9797C821D6101D00F2FD90 /* ResizeBilinearOp.swift */,
+				FC2BFD4D21DF820A00C262B2 /* ConvAddBatchNormReluOp.swift */,
 				FC039BA420E11CBC0081E9F8 /* ConvOp.swift */,
 				FC039BA520E11CBC0081E9F8 /* ElementwiseAddOp.swift */,
 				FC039BA720E11CBC0081E9F8 /* BatchNormOp.swift */,
@@ -446,6 +422,7 @@
 			children = (
 				FCDDC6CD212FE02100E5EF74 /* Base */,
 				FCEB6837212F00B100D2448E /* metal */,
+				FC9797CA21D6102D00F2FD90 /* ResizeBilinearKernel.swift */,
 				FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */,
 				FC0E2DBB20EE45FE009C1FAC /* ConvKernel.swift */,
 				FC0E2DB920EE3B8D009C1FAC /* ReluKernel.swift */,
@@ -471,17 +448,31 @@
 				FC803BC0214CB77A0094B8E5 /* ConvAddPreluKernel.swift */,
 				FCE3A1AA2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift */,
 				FCE3A1AE2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift */,
+				FC2BFD4521DF685F00C262B2 /* Scale.swift */,
+				FCB40E5821E0DCAB0075EC91 /* FetchKernel.swift */,
 			);
 			path = Kernels;
 			sourceTree = "<group>";
 		};
-		FC4FD9762140E4920073E130 /* CPU */ = {
+		FC2BFD4721DF818000C262B2 /* API */ = {
 			isa = PBXGroup;
 			children = (
-				FC4FD9782140E4980073E130 /* libpaddle-mobile.a */,
+				FCE9D7B6214F869000B520C3 /* Net.swift */,
-				FC4FD9772140E4980073E130 /* PaddleMobileCPU.h */,
+				FC1CF3F621D4B4C400F7392E /* Runner.swift */,
+				FC2BFCC121DF2F9100C262B2 /* GlobalConfig.swift */,
+			);
+			path = API;
+			sourceTree = "<group>";
+		};
+		FC2BFD4821DF818000C262B2 /* Src */ = {
+			isa = PBXGroup;
+			children = (
+				FC039BAE20E11CC20081E9F8 /* Program */,
+				FC039BA320E11CBC0081E9F8 /* Operators */,
+				FC039B9C20E11CB20081E9F8 /* Framework */,
+				FC039B9320E11C9A0081E9F8 /* Common */,
 			);
-			path = CPU;
+			path = Src;
 			sourceTree = "<group>";
 		};
 		FCD592FA20E248EC00252966 /* Base */ = {
@@ -497,7 +488,7 @@
 		FCDDC6CD212FE02100E5EF74 /* Base */ = {
 			isa = PBXGroup;
 			children = (
-				FCF2D73720E64E70007AC5F5 /* Kernel.swift */,
+				FC2BFD4921DF81DE00C262B2 /* Kernel.swift */,
 			);
 			path = Base;
 			sourceTree = "<group>";
@@ -533,13 +524,16 @@
 				FC0226552138F33800F395E2 /* TransposeKernel.metal */,
 				4AA1EAAD214F5FD900D0F791 /* TransposeKernel.inc.metal */,
 				FC0226572138F38D00F395E2 /* PoolKernel.metal */,
+				FCCED5E021D71FC000BE8D5F /* PoolKernel.inc.metal */,
 				FC803BC2214CB79C0094B8E5 /* ConvAddPreluKernel.metal */,
 				FC803BC4214CB8F00094B8E5 /* ConvAddPrelu.inc.metal */,
 				FC803BC6214CBA820094B8E5 /* Macro.metal */,
 				FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */,
+				FC9C2A0C21D3D185005856C6 /* FetchKernel.inc.metal */,
 				FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */,
 				FCE3A1B02153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal */,
 				FCE3A1B22153E91900C37CDE /* ElementwiseAddPreluKernel.metal */,
+				FC2BFD5021DF8E0400C262B2 /* Scale.metal */,
 			);
 			path = metal;
 			sourceTree = "<group>";
@@ -551,10 +545,6 @@
 			isa = PBXHeadersBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
-				FC4FD9792140E4980073E130 /* PaddleMobileCPU.h in Headers */,
-				FC292C85214257CB00CF622F /* CPUCompute.h in Headers */,
-				FC292C5421421B2F00CF622F /* PaddleMobileGPU.h in Headers */,
-				4AA1EA9E2148D6F900D0F791 /* ConcatKernel.inc.metal in Headers */,
 				FC039B6F20E11C3C0081E9F8 /* paddle_mobile.h in Headers */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
@@ -650,6 +640,7 @@
 			buildActionMask = 2147483647;
 			files = (
 				FC9D038020E22FBB000F735A /* FeedOp.swift in Sources */,
+				FC9C2A0D21D3D185005856C6 /* FetchKernel.inc.metal in Sources */,
 				4AA1EAAA214F53D800D0F791 /* BoxCoder.inc.metal in Sources */,
 				FC039B9F20E11CB20081E9F8 /* Tensor.swift in Sources */,
 				FC803BC9214CFC8D0094B8E5 /* FetchKernel.metal in Sources */,
@@ -672,20 +663,17 @@
 				FCE3A1AB2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift in Sources */,
 				FC9D037920E229E4000F735A /* OpParam.swift in Sources */,
 				FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */,
-				FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */,
 				FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */,
 				FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */,
 				FCDDC6C6212F9FB800E5EF74 /* PreluKernel.swift in Sources */,
+				FC9797CB21D6102D00F2FD90 /* ResizeBilinearKernel.swift in Sources */,
 				FCA67CD52138272900BD58AA /* ConvAddMetal.metal in Sources */,
 				FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */,
 				4AA1EA8C2146640900D0F791 /* SplitOp.swift in Sources */,
-				FC292C81214255BD00CF622F /* CPUCompute.mm in Sources */,
-				FCEBC0F420F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift in Sources */,
 				4AA1EAAC214F55C800D0F791 /* Softmax.inc.metal in Sources */,
 				FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */,
 				4AF928772133F1DB005B6C3A /* BoxCoder.metal in Sources */,
 				FC803BBF214CB65A0094B8E5 /* ConvAddPreluOp.swift in Sources */,
-				FC33B0F02147659000714A93 /* MobileNet.swift in Sources */,
 				FCEB684C212F093800D2448E /* PreluOp.swift in Sources */,
 				4AA1EAA8214B7AFB00D0F791 /* BilinearInterp.inc.metal in Sources */,
 				FCA67CD92138287B00BD58AA /* ConvBNReluKernel.metal in Sources */,
@@ -697,17 +685,19 @@
 				FC039BBA20E11CC20081E9F8 /* TensorDesc.swift in Sources */,
 				FC039BA020E11CB20081E9F8 /* Dim.swift in Sources */,
 				FC039BB820E11CC20081E9F8 /* framework.pb.swift in Sources */,
-				C28FDF8521B7858F0054EFAC /* YoloNet.swift in Sources */,
 				FC039B9920E11C9A0081E9F8 /* Types.swift in Sources */,
 				FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */,
 				FCA3A1632132A4AC00084FE5 /* ReshapeKernel.metal in Sources */,
-				FC4FD9752140E1DE0073E130 /* PaddleMobile.swift in Sources */,
 				FCBCCC592122F42700D94F7E /* ConvBNReluOp.swift in Sources */,
 				FC039BA920E11CBC0081E9F8 /* ConvOp.swift in Sources */,
+				FCCED5E121D71FC000BE8D5F /* PoolKernel.inc.metal in Sources */,
+				FC2BFD4A21DF81DE00C262B2 /* Kernel.swift in Sources */,
 				FC9D038420E23B01000F735A /* Texture.swift in Sources */,
 				FCE3A1B32153E91900C37CDE /* ElementwiseAddPreluKernel.metal in Sources */,
+				FC2BFD4E21DF820B00C262B2 /* ConvAddBatchNormReluOp.swift in Sources */,
 				4AA1EAA2214912CD00D0F791 /* FlattenKernel.swift in Sources */,
 				4AA1EA982146666500D0F791 /* FlattenOp.swift in Sources */,
+				FC2BFCC221DF2F9100C262B2 /* GlobalConfig.swift in Sources */,
 				FCBCCC652122FCD700D94F7E /* TransposeOp.swift in Sources */,
 				4AA1EAA6214B5F6800D0F791 /* Shape.metal in Sources */,
 				FCD04E6E20F31B4B0007374F /* ReshapeOp.swift in Sources */,
@@ -715,14 +705,13 @@
 				FC039BBF20E11CC20081E9F8 /* Attribute.swift in Sources */,
 				4AA1EA8E2146647F00D0F791 /* SplitKernel.swift in Sources */,
 				FCD04E7420F3437E0007374F /* ConvAddKernel.swift in Sources */,
+				FC1CF3F721D4B4C400F7392E /* Runner.swift in Sources */,
 				FC039BB920E11CC20081E9F8 /* Scope.swift in Sources */,
-				FC292C5621421B4600CF622F /* PaddleMobileGPU.m in Sources */,
 				FCD04E6620F314C50007374F /* PoolOp.swift in Sources */,
 				FCE9D7B9214FAA4800B520C3 /* NMSFetchResultKernel.metal in Sources */,
 				FC039BAC20E11CBC0081E9F8 /* BatchNormOp.swift in Sources */,
 				FCBCCC6F2123097100D94F7E /* MulticlassNMSOp.swift in Sources */,
 				FC039BBC20E11CC20081E9F8 /* VarDesc.swift in Sources */,
-				FC292C872142624800CF622F /* Genet.swift in Sources */,
 				FC803BC5214CB8F00094B8E5 /* ConvAddPrelu.inc.metal in Sources */,
 				4AF928822135673D005B6C3A /* ConcatKernel.metal in Sources */,
 				FCBCCC632122FCC000D94F7E /* TransposeKernel.swift in Sources */,
@@ -731,6 +720,7 @@
 				FC0E2DBA20EE3B8D009C1FAC /* ReluKernel.swift in Sources */,
 				4AA1EA862146625E00D0F791 /* BilinearInterpOp.swift in Sources */,
 				FCBCCC6D2123073A00D94F7E /* BoxcoderKernel.swift in Sources */,
+				FCB40E5921E0DCAB0075EC91 /* FetchKernel.swift in Sources */,
 				FCBCCC69212306D300D94F7E /* ConcatKernel.swift in Sources */,
 				FCDDC6C8212FA3CA00E5EF74 /* ConvTransposeKernel.swift in Sources */,
 				FC82735920E3C04200BE430A /* OpCreator.swift in Sources */,
@@ -742,20 +732,18 @@
 				FCE9D7B7214F869000B520C3 /* Net.swift in Sources */,
 				FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */,
 				FC039BAB20E11CBC0081E9F8 /* Operator.swift in Sources */,
-				C28FDF8421B7858F0054EFAC /* MobileNetCombined.swift in Sources */,
 				FCD04E6A20F319EC0007374F /* SoftmaxOp.swift in Sources */,
-				FC292C82214255BD00CF622F /* MobileNetSSD.swift in Sources */,
 				FCBCCC612122FBDF00D94F7E /* PriorBoxKernel.swift in Sources */,
 				FCBCCC5F2122FB3B00D94F7E /* PriorBoxOp.swift in Sources */,
 				FC9D038220E2312E000F735A /* FetchOp.swift in Sources */,
 				FCA67B1721364EF000BD58AA /* ConvTransposeKernel.metal in Sources */,
 				FC039BBD20E11CC20081E9F8 /* Program.swift in Sources */,
+				FC2BFD5121DF8E0400C262B2 /* Scale.metal in Sources */,
 				FC039BA220E11CB70081E9F8 /* Loader.swift in Sources */,
 				FCBCCC67212306B000D94F7E /* ConcatOp.swift in Sources */,
 				FCD04E6C20F31A280007374F /* SoftmaxKernel.swift in Sources */,
 				FCEB684A212F00DB00D2448E /* PreluKernel.metal in Sources */,
 				4AA1EAA02148DEEE00D0F791 /* ReshapeKernel.inc.metal in Sources */,
-				FC9A19E32148C31300CD9CBF /* MobilenetSSD_AR.swift in Sources */,
 				FCDDC6CF212FE14700E5EF74 /* PriorBoxKernel.metal in Sources */,
 				FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */,
 				FCE3A1A92153DE5100C37CDE /* ConvAddAddPreluOp.swift in Sources */,
@@ -769,7 +757,9 @@
 				FC039BAD20E11CBC0081E9F8 /* ReluOp.swift in Sources */,
 				FCBCCC572122F41300D94F7E /* DwConvBNReluOp.swift in Sources */,
 				FC039BBE20E11CC20081E9F8 /* OpDesc.swift in Sources */,
+				FC9797C921D6101D00F2FD90 /* ResizeBilinearOp.swift in Sources */,
 				4AA1EA88214662BD00D0F791 /* BilinearInterpKernel.swift in Sources */,
+				FC2BFD4621DF685F00C262B2 /* Scale.swift in Sources */,
 				FC039B9720E11C9A0081E9F8 /* Extensions.swift in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
@@ -909,7 +899,7 @@
 				DYLIB_COMPATIBILITY_VERSION = 1;
 				DYLIB_CURRENT_VERSION = 1;
 				DYLIB_INSTALL_NAME_BASE = "@rpath";
-				ENABLE_BITCODE = NO;
+				ENABLE_BITCODE = YES;
 				INFOPLIST_FILE = "paddle-mobile/Info.plist";
 				INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks";
 				IPHONEOS_DEPLOYMENT_TARGET = 9.0;
@@ -946,7 +936,7 @@
 				DYLIB_COMPATIBILITY_VERSION = 1;
 				DYLIB_CURRENT_VERSION = 1;
 				DYLIB_INSTALL_NAME_BASE = "@rpath";
-				ENABLE_BITCODE = NO;
+				ENABLE_BITCODE = YES;
 				INFOPLIST_FILE = "paddle-mobile/Info.plist";
 				INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks";
 				IPHONEOS_DEPLOYMENT_TARGET = 9.0;

--- a/metal/paddle-mobile/paddle-mobile.xcodeproj/xcshareddata/xcschemes/paddle-mobile.xcscheme
+++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/xcshareddata/xcschemes/paddle-mobile.xcscheme
+<?xml version="1.0" encoding="UTF-8"?>
+<Scheme
+   LastUpgradeVersion = "1010"
+   version = "1.3">
+   <BuildAction
+      parallelizeBuildables = "YES"
+      buildImplicitDependencies = "YES">
+      <BuildActionEntries>
+         <BuildActionEntry
+            buildForTesting = "YES"
+            buildForRunning = "YES"
+            buildForProfiling = "YES"
+            buildForArchiving = "YES"
+            buildForAnalyzing = "YES">
+            <BuildableReference
+               BuildableIdentifier = "primary"
+               BlueprintIdentifier = "FC039B6920E11C3C0081E9F8"
+               BuildableName = "paddle_mobile.framework"
+               BlueprintName = "paddle-mobile"
+               ReferencedContainer = "container:paddle-mobile.xcodeproj">
+            </BuildableReference>
+         </BuildActionEntry>
+      </BuildActionEntries>
+   </BuildAction>
+   <TestAction
+      buildConfiguration = "Debug"
+      selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
+      selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
+      shouldUseLaunchSchemeArgsEnv = "YES">
+      <Testables>
+      </Testables>
+      <AdditionalOptions>
+      </AdditionalOptions>
+   </TestAction>
+   <LaunchAction
+      buildConfiguration = "Debug"
+      selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
+      selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
+      launchStyle = "0"
+      useCustomWorkingDirectory = "NO"
+      ignoresPersistentStateOnLaunch = "NO"
+      debugDocumentVersioning = "YES"
+      debugServiceExtension = "internal"
+      allowLocationSimulation = "YES">
+      <MacroExpansion>
+         <BuildableReference
+            BuildableIdentifier = "primary"
+            BlueprintIdentifier = "FC039B6920E11C3C0081E9F8"
+            BuildableName = "paddle_mobile.framework"
+            BlueprintName = "paddle-mobile"
+            ReferencedContainer = "container:paddle-mobile.xcodeproj">
+         </BuildableReference>
+      </MacroExpansion>
+      <AdditionalOptions>
+      </AdditionalOptions>
+   </LaunchAction>
+   <ProfileAction
+      buildConfiguration = "Release"
+      shouldUseLaunchSchemeArgsEnv = "YES"
+      savedToolIdentifier = ""
+      useCustomWorkingDirectory = "NO"
+      debugDocumentVersioning = "YES">
+      <MacroExpansion>
+         <BuildableReference
+            BuildableIdentifier = "primary"
+            BlueprintIdentifier = "FC039B6920E11C3C0081E9F8"
+            BuildableName = "paddle_mobile.framework"
+            BlueprintName = "paddle-mobile"
+            ReferencedContainer = "container:paddle-mobile.xcodeproj">
+         </BuildableReference>
+      </MacroExpansion>
+   </ProfileAction>
+   <AnalyzeAction
+      buildConfiguration = "Debug">
+   </AnalyzeAction>
+   <ArchiveAction
+      buildConfiguration = "Release"
+      revealArchiveInOrganizer = "YES">
+   </ArchiveAction>
+</Scheme>
--- a/metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift
+++ b/metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+import Foundation
+@objc public enum MetalLoadMode: Int {  // Selects where the Metal code is loaded from; Int-backed and exposed to Objective-C.
+  case
+  LoadMetalInPaddleMobile   = 1,     // use the Metal code inside paddle-mobile
+  LoadMetalInDefaultLib     = 2,     // use the Metal code in the main bundle
+  LoadMetalInCustomMetalLib = 3      // use a Metal library file
+}
+@objc public enum ComputePrecision: Int {  // Compute precision selector; Int-backed and exposed to Objective-C.
+  case
+  Float32 = 1,
+  Float16 = 2
+}
+@objc public class GlobalConfig: NSObject {  // Holds global settings, reached through the `shared` instance below.
+  /// Singleton instance.
+  @objc public static let shared: GlobalConfig = GlobalConfig.init()
+  /// Compute precision; must stay unchanged during the lifetime of a runner. Defaults to .Float16.
+  @objc public var computePrecision: ComputePrecision = .Float16
+}
--- a/metal/paddle-mobile/paddle-mobile/API/Net.swift
+++ b/metal/paddle-mobile/paddle-mobile/API/Net.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+import Metal
+import Foundation
+/// Base class for networks. The parameters already have default values; change the ones you need in your subclass implementation.
+@objc open class Net: NSObject {
+  /// Defaults to 0. If a count is given, the last `except` ops are not run on the GPU, and the intermediate result is passed in through fetchResult.
+  @objc public var except: Int = 0
+  /// Preprocessing kernel; set it if the input image needs preprocessing.
+  @objc public var preprocessKernel: CusomKernel? = nil
+  // The following four parameters are used when the model is read from memory.
+  /// Pointer to the model in memory.
+  @objc public var modelPointer: UnsafeMutableRawPointer? = nil
+  /// Model size, in bytes.
+  @objc public var modelSize: Int = 0
+  /// Pointer to the weight parameters in memory.
+  @objc public var paramPointer: UnsafeMutableRawPointer? = nil
+  /// Weights size, in bytes.
+  @objc public var paramSize: Int = 0
+  // The following two parameters are used when the model is read from files.
+  /// Path to the model file.
+  @objc public var modelPath: String? = nil
+  /// Path to the weights file.
+  @objc public var paramPath: String? = nil
+  /// Represents the GPU processor.
+  @objc public let device: MTLDevice
+  /// How the Metal code is loaded. Note: with a static library, only LoadMetalInDefaultLib or LoadMetalInCustomMetalLib can be used to load the Metal code.
+  @objc public var metalLoadMode: MetalLoadMode = .LoadMetalInPaddleMobile
+  /// When metalLoadMode is LoadMetalInCustomMetalLib, the Metal library path must not be nil.
+  @objc public var metalLibPath: String? = nil
+  /// Input dimensions, passed in n h w c order.
+  @objc public var inputDim: Dim = Dim.init(inDim: [])
+  @objc public init(device: MTLDevice, paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) {
+    self.paramPointer = paramPointer
+    self.paramSize = paramSize
+    self.modelPointer = modePointer  // NOTE(review): the parameter label is spelled "modePointer", not "modelPointer"; renaming it would break existing callers.
+    self.modelSize = modelSize
+    self.device = device
+    super.init()
+  }
+  @objc public init(device: MTLDevice) {  // Only stores the device; every other property keeps its default value.
+    self.device = device
+    super.init()
+  }
+  @objc open func resultStr(res: ResultHolder) -> String {  // The base implementation always traps; presumably subclasses are expected to override it.
+    fatalError()
+  }
+  @objc open func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder {  // Wraps the GPU result's pointer and capacity in a ResultHolder.
+    guard let inResPointer = paddleMobileRes.resultPointer else {
+      fatalError()  // Traps when the GPU result carries no result pointer.
+    }
+    return ResultHolder.init(inResult: inResPointer, inCapacity: paddleMobileRes.capacity)
+  }
+  open func updateProgram(program: Program) {  // Empty in the base class; declared open so subclasses can override it.
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/PaddleMobile.swift
+++ b/metal/paddle-mobile/paddle-mobile/PaddleMobile.swift
@@ -12,25 +12,22 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
-import Metal
 import MetalKit
 import Foundation
-@objc public enum Platform: Int{
+@objc public class ResultHolder: NSObject {
-  case CPU, GPU
+  @objc public let result: UnsafeMutablePointer<Float32>
-}
+  @objc public let capacity: Int
-class ScaleKernel: CusomKernel {
+  init(inResult: UnsafeMutablePointer<Float32>, inCapacity: Int) {
-  init(device: MTLDevice, shape: Shape) {
+    result = inResult
-    if computePrecision == .Float32 {
+    capacity = inCapacity
-      super.init(device: device, inFunctionName: "scale", outputDim: shape, usePaddleMobileLib: false)
-    } else if computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "scale_half", outputDim: shape, usePaddleMobileLib: false)
-    } else {
-      fatalError(" unsupport ")
-    }
  }
+  /// Deinitializes the `capacity` elements that `result` points to and then
+  /// deallocates that memory. `result` must not be dereferenced afterwards.
+  @objc public func releasePointer() {
+    let elementCount = capacity
+    result.deinitialize(count: elementCount)
+    result.deallocate()
+  }
 }
 @objc public class Runner: NSObject {
@@ -40,86 +37,77 @@ class ScaleKernel: CusomKernel {
  var textureLoader: MTKTextureLoader?
  public let net: Net
  let device: MTLDevice?
-  let platform: Platform
-  var cpuPaddleMobile: PaddleMobileCPU?
  let numel: Int
-  let meansNumber: [NSNumber]
-  // dims num nchw
+  /// 初始化函数
-  let dimsNum: [NSNumber]
+  ///
-  /**
+  /// - Parameters:
-   * inNet:        需要运行的网络
+  ///   - inNet: 传入自定义的网络
-   * commandQueue: GPU 是需要传入
+  ///   - commandQueue: commandQueue
-   * inPlatform:   需要使用的平台, GPU or CPU
+  @objc public init(inNet: Net, commandQueue: MTLCommandQueue?) {
-   */
+    guard inNet.inputDim.cout() == 4 else {
-  @objc public init(inNet: Net, commandQueue: MTLCommandQueue?, inPlatform: Platform) {
+      fatalError(" input dim count must 4 ")
+    }
    net = inNet
    queue = commandQueue
    device = queue?.device
-    platform = inPlatform
    if let inDevice = device {
      textureLoader = MTKTextureLoader.init(device: inDevice)
    }
-    if platform == .CPU {
+    numel = net.inputDim.numel()
-      cpuPaddleMobile = PaddleMobileCPU.init()
-    }
-    numel = net.dim.n * net.dim.c * net.dim.h * net.dim.w
-    meansNumber = net.means.map { NSNumber.init(value: $0) }
-    dimsNum = [NSNumber.init(value: net.dim.n),
-               NSNumber.init(value: net.dim.c),
-               NSNumber.init(value: net.dim.h),
-               NSNumber.init(value: net.dim.w)]
  }
-  /**
+  /// load 模型, 返回 true 可进行预测
-   * load 模型, 返回 true 可进行预测
+  ///
-   */
+  /// - Returns: load 成功或失败
  @objc public func load() -> Bool {
-    if platform == .GPU {
      guard let inDevice = device, let inQueue = queue else {
        print(" paddle mobile gpu load error, need MTLCommandQueue")
        return false
      }
      let loader = Loader<Float32>.init()
      do {
-//        program = try loader.load(device: inDevice, paramPointer: net.paramPointer!, paramSize: net.paramSize,modePointer:net.modelPointer!,modelSize:net.modelSize)
-        program = try loader.load(device: inDevice, modelPath: net.modelPath, paraPath: net.paramPath)
+        if let inParamPointer = net.paramPointer, let inModelPointer = net.modelPointer {
+          guard net.paramSize > 0 && net.modelSize > 0 else {
+            print(" load from memory param size or model size can't 0 ")
+            return false
+          }
+          program = try loader.load(device: inDevice, paramPointer: inParamPointer, paramSize: net.paramSize,modePointer:inModelPointer,modelSize:net.modelSize)
+        } else if let inModelPath = net.modelPath, let inParamPath = net.paramPath {
+          program = try loader.load(device: inDevice, modelPath: inModelPath, paraPath: inParamPath)
+        } else {
+          print(" model pointer or model file path need be specified")
+          return false
+        }
+        let initContext: InitContext = InitContext.init()
+        initContext.metalLoadMode = net.metalLoadMode
+        initContext.metalLibPath = net.metalLibPath
+        executor = try Executor<Float32>.init(inDevice: inDevice, inQueue: inQueue, inProgram: program!, initContext: initContext)
        net.updateProgram(program: program!)
-        executor = try Executor<Float32>.init(inDevice: inDevice, inQueue: inQueue, inProgram: program!)
      } catch let error {
        print(error)
        return false
      }
-    } else {
-      return cpuPaddleMobile?.load(net.modelPath, andWeightsPath: net.paramPath) ?? false
-    }
    return true
  }
-  @objc public func predict(inputPointer: UnsafeMutablePointer<Float32>, completion: @escaping ( _ success: Bool, _ result: PaddleMobileCPUResult?) -> Void) {
+  /// 预测
+  ///
-    guard let res = cpuPaddleMobile?.predictInput(inputPointer, dim: dimsNum) else {
+  /// - Parameters:
-      completion(false, nil)
+  ///   - texture: 输入 texture 需要使用 getTexture 获得
-      return
+  ///   - completion: 结果回调， 当 success 为 true 时 result 不为 nil
-    }
-    completion(true, res)
-  }
-  /**
-   * GPU 版本 predict
-   * texture: 需要预测的 texture 需要做过预处理
-   * ( _ success: Bool, _ time:TimeInterval, _ resultArray: [Float32]) -> Void : 回调闭包, 三个参数分别为: 是否成功, 预测耗时, 结果数组
-   */
  @objc public func predict(texture: MTLTexture, completion: @escaping ( _ success: Bool, _ result: ResultHolder?) -> Void) {
    do {
-      try self.executor?.predict(input: texture, dim: [self.net.dim.n, self.net.dim.h, self.net.dim.w, self.net.dim.c], completionHandle: { [weak self] (res) in
+      try self.executor?.predict(input: texture, dim: self.net.inputDim, completionHandle: { [weak self] (res) in
        guard let SSelf = self else {
          fatalError( " self nil " )
        }
        let result = SSelf.net.fetchResult(paddleMobileRes: res)
        completion(true, result)
-      }, preProcessKernle: self.net.preprocessKernel, except: self.net.except)
+        }, preProcessKernle: self.net.preprocessKernel, except: self.net.except)
    } catch let error {
      print(error)
      completion(false, nil)
@@ -127,59 +115,64 @@ class ScaleKernel: CusomKernel {
    }
  }
-  /**
+  /// 清理内存, 调用此函数后, 不能再使用, 需重新 load
-   * CPU GPU 通用版本 predict
-   * cgImage: 需要预测的图片
-   * ( _ success: Bool, _ time:TimeInterval, _ resultArray: [Float32]) -> Void : 回调闭包, 三个参数分别为: 是否成功, 预测耗时, 结果数组
-   */
-//  @objc public func predict(cgImage: CGImage, completion: @escaping ( _ success: Bool, _ resultArray: [Float32]) -> Void) {
-//    if platform == .GPU {
-//      getTexture(image: cgImage) { [weak self] (texture) in
-//        guard let SSelf = self else {
-//          fatalError( "" )
-//        }
-//        SSelf.predict(texture: texture, completion: completion)
-//      }
-//    } else if platform == .CPU {
-//      let input = preproccess(image: cgImage)
-//      predict(inputPointer: input, completion: completion)
-//      input.deinitialize(count: numel)
-//      input.deallocate()
-//    }
-//  }
-  /*
-   * 清理内存, 调用此函数后, 不能再使用, 需重新 load
-   */
  @objc public func clear() {
-    if platform == .GPU {
+    executor?.clear()
-      executor?.clear()
+    executor = nil
-      executor = nil
+    program = nil
-      program = nil
-    } else if platform == .CPU {
-      cpuPaddleMobile?.clear()
-    }
  }
-  @objc public func preproccess(image: CGImage) -> UnsafeMutablePointer<Float> {
+  /// 获取 texture, 对 texture 进行预处理, 预测时使用
-    let output = UnsafeMutablePointer<Float>.allocate(capacity: numel)
+  ///
-    let means = net.means.map { NSNumber.init(value: $0) }
+  /// - Parameters:
-    let dims = [NSNumber.init(value: net.dim.n),
+  ///   - image: 输入图像
-                NSNumber.init(value: net.dim.c),
+  ///   - getTexture: 获取 texture 回调
-                NSNumber.init(value: net.dim.h),
-                NSNumber.init(value: net.dim.w)]
-    cpuPaddleMobile?.preprocess(image, output: output, means: means, scale: net.scale, dim: dims)
-    return output
-  }
-  /*
-   * 获取 texture, 对 texture 进行预处理, GPU 预测时使用
-   */
  @objc public func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) {
    let texture = try? textureLoader?.newTexture(cgImage: image, options: [:]) ?! " texture loader error"
    scaleTexture(input: texture!, complete: getTexture)
  }
+  /// Obtains a texture from a buffer; the conversion is performed on the GPU.
+  ///
+  /// Calls `fatalError` when the queue or device is nil, when no command
+  /// buffer can be made, or when the conversion kernel throws.
+  ///
+  /// - Parameters:
+  ///   - inBuffer: input buffer
+  ///   - getTexture: result callback, invoked from the command buffer's completed handler
+  @objc public func getTexture(inBuffer: MTLBuffer, getTexture: @escaping (MTLTexture) -> Void) {
+    guard let inQueue = queue, let inDevice = device else {
+      fatalError( " queue or devcie nil " )
+    }
+    guard let commandBuffer = inQueue.makeCommandBuffer() else {
+      fatalError( " make buffer error" )
+    }
+    // Index 1 of inputDim is used as the height, 2 as the width, 3 as the channel count.
+    let outputShape = Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: net.inputDim[3])
+    let kernel = BufferToTextureKernel.init(device: inDevice, outputDim: outputShape, metalLoadMode: net.metalLoadMode, metalLibPath: net.metalLibPath)
+    do {
+      try kernel.compute(inputBuffer: inBuffer, commandBuffer: commandBuffer)
+    } catch {
+      fatalError(" bufferToTextureKernel error ")
+    }
+    commandBuffer.addCompletedHandler { _ in
+      getTexture(kernel.outputTexture)
+    }
+    commandBuffer.commit()
+  }
+  /// Updates the input dimensions, for models with variable-length input.
+  ///
+  /// Does nothing when `inDim` equals the current `net.inputDim`. Otherwise
+  /// stores the new dimensions on the net and calls `net.updateProgram`;
+  /// calls `fatalError` if no program has been loaded yet.
+  ///
+  /// - Parameter inDim: new input dimensions
+  @objc public func updateInputDim(inDim: Dim) {
+    guard net.inputDim != inDim else {
+      return
+    }
+    guard let loadedProgram = program else {
+      fatalError(" need load first ")
+    }
+    net.inputDim = inDim
+    net.updateProgram(program: loadedProgram)
+  }
  public func scaleTexture(input: MTLTexture , complete: @escaping (MTLTexture) -> Void) {
    guard let inQueue = queue, let inDevice = device else {
@@ -190,7 +183,7 @@ class ScaleKernel: CusomKernel {
      fatalError( " make buffer error" )
    }
-    let scaleKernel = ScaleKernel.init(device: inDevice, shape: CusomKernel.Shape.init(inWidth: net.dim.w, inHeight: net.dim.h, inChannel: 3))
+    let scaleKernel = ScaleKernel.init(device: inDevice, shape: Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: 3), metalLoadMode: net.metalLoadMode, metalLibPath: net.metalLibPath)
    do {
      try scaleKernel.compute(inputTexuture: input, commandBuffer: buffer)
@@ -205,5 +198,3 @@ class ScaleKernel: CusomKernel {
    buffer.commit()
  }
 }
--- a/metal/paddle-mobile/paddle-mobile/CPU/PaddleMobileCPU.h
+++ b/metal/paddle-mobile/paddle-mobile/CPU/PaddleMobileCPU.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License. */
-#pragma once
-#import <CoreImage/CoreImage.h>
-#import <Foundation/Foundation.h>
-@interface PaddleMobileCPUResult: NSObject
-@property (assign, nonatomic, readonly) float *output;
-@property (assign, nonatomic, readonly) int outputSize;
-(void)releaseOutput;
-@end
-@interface PaddleMobileCPU : NSObject
-/*
-    创建对象
-*/
- (instancetype)init;
-/*
-    load 模型, 开辟内存
-*/
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
-/*
-  加载散开形式的模型, 需传入模型的目录
-*/
- (BOOL)load:(NSString *)modelAndWeightPath;
-/*
- * 从内存中加载模型
- * */
- (BOOL)LoadCombinedMemory:(size_t)modelLen
-               andModelBuf:(const uint8_t *)modelBuf
-         andModelParamsLen:(size_t)combinedParamsLen
-      andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf;
-/*
- *  对图像进行预处理, 需要外部开辟 output 内存, 外部释放 output 内存
- * */
-(void)preprocess:(CGImageRef)image
-           output:(float *)output
-            means:(NSArray<NSNumber *> *)means
-        scale:(float)scale
-        dim:(NSArray<NSNumber *> *)dim;
-/*
- * 预测预处理后的数据, 返回结果使用结束需要调用其 realseOutput 函数进行释放
- * */
- (PaddleMobileCPUResult *)predictInput:(float *)input
-                                    dim:(NSArray<NSNumber *> *)dim;
-/*
-    进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
-*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
-/*
-    进行预测, 默认 means 为 0, scale 为 1.0
-*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
-/*
-    清理内存
-*/
- (void)clear;
-@end
--- a/metal/paddle-mobile/paddle-mobile/MobileNetCombined.swift
+++ b/metal/paddle-mobile/paddle-mobile/MobileNetCombined.swift
-//
-//  MobileNetCombined.swift
-//  paddle-mobile
-//
-//  Created by Xiao,Haichun on 2018/12/5.
-//  Copyright © 2018 orange. All rights reserved.
-//
-import Foundation
-public class MobileNetCombined: Net {
-    @objc public override init(device: MTLDevice) {
-        super.init(device: device)
-        means = [0, 0, 0]
-        scale = 1
-        except = 0
-        modelPath = Bundle.main.path(forResource: "combined_mobilenet_model", ofType: nil) ?! "model null"
-        paramPath = Bundle.main.path(forResource: "combined_mobilenet_params", ofType: nil) ?! "para null"
-        modelDir = ""
-        //preprocessKernel = GenetPreProccess.init(device: device)
-        dim = (n: 1, h: 416, w: 416, c: 3)
-    }
-    @objc override public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) {
-        super.init(device:device,paramPointer:paramPointer,paramSize:paramSize,modePointer:modePointer,modelSize:modelSize)
-        means = [0, 0, 0]
-        scale = 1
-        except = 0
-        modelPath = ""
-        paramPath = ""
-        modelDir = ""
-        //preprocessKernel = GenetPreProccess.init(device: device)
-        dim = (n: 1, h: 416, w: 416, c: 3)
-    }
-    //    class GenetPreProccess: CusomKernel {
-    //        init(device: MTLDevice) {
-    //            let s = CusomKernel.Shape.init(inWidth: 128, inHeight: 128, inChannel: 3)
-    //            super.init(device: device, inFunctionName: "genet_preprocess", outputDim: s, usePaddleMobileLib: false)
-    //        }
-    //    }
-    override  public func resultStr(res: ResultHolder) -> String {
-        //    fatalError()
-        return " \(res.result![0]) ... "
-    }
-}
--- a/metal/paddle-mobile/paddle-mobile/Net.swift
+++ b/metal/paddle-mobile/paddle-mobile/Net.swift
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License. */
-import Foundation
-import Metal
-public class ResultHolder: NSObject {
-  @objc public let result: UnsafeMutablePointer<Float32>?
-  @objc public let capacity: Int
-  init(inResult: UnsafeMutablePointer<Float32>?, inCapacity: Int) {
-    result = inResult
-    capacity = inCapacity
-  }
-  @objc public func releasePointer() {
-    result?.deinitialize(count: capacity)
-    result?.deallocate()
-  }
-}
-public class Net: NSObject {
-  var except: Int = 0
-  var means: [Float] = []
-  var scale: Float = 0.0
-  var dim: (n: Int, h: Int, w: Int, c: Int) = (n: 0, h: 0, w: 0, c: 0)
-  var preprocessKernel: CusomKernel? = nil
-  var paramPointer: UnsafeMutableRawPointer? = nil
-  var paramSize: Int = 0
-  var modelPointer: UnsafeMutableRawPointer? = nil
-  var modelSize: Int = 0
-  var modelPath: String = ""
-  var paramPath: String = ""
-  var modelDir: String = ""
-  @objc public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) {
-      self.paramPointer = paramPointer
-      self.paramSize = paramSize
-      self.modelPointer = modePointer
-      self.modelSize = modelSize
-      super.init()
-  }
-  public func resultStr(res: ResultHolder) -> String {
-    fatalError()
-  }
-  func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder {
-    return ResultHolder.init(inResult: paddleMobileRes.resultPointer, inCapacity: paddleMobileRes.capacity)
-  }
-  @objc public init(device: MTLDevice) {
-    super.init()
-  }
-  func updateProgram(program: Program) {
-  }
-}
--- a/metal/paddle-mobile/paddle-mobile/Common/Errors.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/Errors.swift
--- a/metal/paddle-mobile/paddle-mobile/Common/Extensions.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/Extensions.swift
@@ -110,9 +110,27 @@ extension Array {
      return newArray
    }
  }
+  /// Copies `count` consecutive `Float32` values starting at `floatArrBuffer`
+  /// into a new array.
+  ///
+  /// - Parameters:
+  ///   - floatArrBuffer: pointer to the first element to copy
+  ///   - count: number of elements to copy; a negative value traps
+  /// - Returns: array holding the copied values, empty when `count` is 0
+  public static func floatArrWithBuffer(floatArrBuffer: UnsafeMutablePointer<Float32>, count: Int) -> [Float32] {
+    // Keep the trap the original `0..<count` range produced for negative counts.
+    precondition(count >= 0, "count must not be negative")
+    // One bulk copy instead of appending element by element.
+    let source = UnsafeBufferPointer<Float32>(start: UnsafePointer(floatArrBuffer), count: count)
+    return [Float32](source)
+  }
+}
+extension UnsafeMutablePointer {
+  /// Copies `count` consecutive elements starting at this pointer into a new array.
+  ///
+  /// - Parameter count: number of elements to copy; a negative value traps
+  /// - Returns: array holding the copied elements, empty when `count` is 0
+  public func floatArr(count: Int) -> [Pointee]{
+    // Keep the trap the original `0..<count` range produced for negative counts.
+    precondition(count >= 0, "count must not be negative")
+    // One bulk copy instead of appending element by element.
+    let source = UnsafeBufferPointer<Pointee>(start: UnsafePointer(self), count: count)
+    return [Pointee](source)
+  }
 }
-extension String{
+extension String {
  func cStr() -> UnsafePointer<Int8>? {
    return (self as NSString).utf8String
  }

--- a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
@@ -18,6 +18,7 @@ import CoreMedia
 fileprivate var defaultMetalLibrary: MTLLibrary?
 fileprivate var paddleMobileMetalLibrary: MTLLibrary?
+fileprivate var customMetalLibrary: MTLLibrary?
 extension MTLDevice {
  func defaultLibrary() -> MTLLibrary {
@@ -31,6 +32,22 @@ extension MTLDevice {
    }
  }
+  /// Returns the custom metal library, loading it from `metalLibPath` on first use.
+  ///
+  /// The loaded library is kept in the file-level `customMetalLibrary`
+  /// variable. Once that variable is set, later calls return it without
+  /// looking at `metalLibPath`, so a different path passed later is ignored.
+  /// Calls `fatalError` when the library cannot be created.
+  ///
+  /// - Parameter metalLibPath: file path of the metal library
+  /// - Returns: the cached or newly loaded library
+  func customLibrary(metalLibPath: String) -> MTLLibrary {
+    if let cachedLib = customMetalLibrary {
+      return cachedLib
+    }
+    do {
+      customMetalLibrary = try makeLibrary(filepath: metalLibPath)
+    } catch  let error {
+      fatalError("\(error)")
+    }
+    guard let loadedLib = customMetalLibrary else {
+      fatalError(" customlib is nil ")
+    }
+    return loadedLib
+  }
  func paddleMobileLibrary() -> MTLLibrary {
    if paddleMobileMetalLibrary == nil {
      guard let path = Bundle.init(for: Kernel.self).path(forResource: "default", ofType: "metallib") else {
@@ -50,8 +67,19 @@ extension MTLDevice {
    }
  }
-  func pipeLine(funcName: String, inPaddleMobileLib: Bool = true) -> MTLComputePipelineState {
+  func pipeLine(funcName: String, metalLoadMode: MetalLoadMode, metalLibPath: String?) -> MTLComputePipelineState {
-    let useLib = inPaddleMobileLib ? paddleMobileLibrary() : defaultLibrary()
+    let useLib: MTLLibrary
+    switch metalLoadMode {
+    case .LoadMetalInDefaultLib:
+      useLib = defaultLibrary()
+    case .LoadMetalInPaddleMobile:
+      useLib = paddleMobileLibrary()
+    case .LoadMetalInCustomMetalLib:
+      useLib = customLibrary(metalLibPath: metalLibPath ?! " can't be nil ")
+    default:
+      fatalError()
+    }
    guard let function = useLib.makeFunction(name: funcName) else {
      fatalError(" function " + funcName + " not found")
    }
@@ -501,7 +529,7 @@ public extension MTLTexture {
    } else {
      fatalError(" 目前还不支持其他类型 ")
    }
+    print(textureArray.count)
    var output: [Float32] = []
    for s in 0..<arrayLength {
      for c in 0..<4{

--- a/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift
@@ -324,9 +324,10 @@ public class PaddleMobileUnitTest {
        let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize)
+      let initContext = InitContext.init()
+      initContext.metalLoadMode = .LoadMetalInDefaultLib
+      let convAddBnReluKernel = ConvAddBatchNormReluKernel<Float32>.init(device: device, testParam: param, initContext: initContext)
-        let convAddBnReluKernel = ConvAddBatchNormReluKernel<Float32>.init(device: device, testParam: param)
        convAddBnReluKernel.test(commandBuffer: buffer, param: param)

--- a/metal/paddle-mobile/paddle-mobile/Common/Tools.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/Tools.swift
--- a/metal/paddle-mobile/paddle-mobile/Common/Types.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/Types.swift
@@ -250,39 +250,40 @@ extension InputTexture: Variant {
 }
 extension MTLTexture where Self: Variant {
 }
-class FetchHolder: Variant {
+public class FetchHolder: Variant {
  var resultBuffer: MTLBuffer?
-  var dim: [Int]
+  public var dim: Dim
-  var capacity: Int
+  public var capacity: Int
+  public var paddedCapacity: Int
-  init(inCapacity: Int, inDim: [Int]) {
+  init(inPaddedCapacity: Int, inDim: Dim) {
-    capacity = inCapacity
+    paddedCapacity = inPaddedCapacity
+    capacity = inDim.numel()
    dim = inDim
  }
-  func initBuffer(device: MTLDevice) {
+  public func initBuffer(device: MTLDevice) {
-    resultBuffer = device.makeBuffer(length: capacity * 4, options: [])
+    resultBuffer = device.makeBuffer(length: paddedCapacity * 4, options: [])
  }
  var result: UnsafeMutablePointer<Float32> {
    guard let inResultBuffer = resultBuffer else {
      fatalError()
    }
-    return inResultBuffer.contents().bindMemory(to: Float32.self, capacity: capacity)
+    return inResultBuffer.contents().bindMemory(to: Float32.self, capacity: paddedCapacity)
  }
 }
 extension FetchHolder: CustomStringConvertible, CustomDebugStringConvertible {
-  var description: String {
+  public var description: String {
    fatalError()
 //    return "\(result)"
  }
-  var debugDescription: String {
+  public var debugDescription: String {
    fatalError()
 //    return "\(result)"
  }

--- a/metal/paddle-mobile/paddle-mobile/framework/Dim.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Dim.swift
@@ -14,39 +14,42 @@
 import Foundation
-public struct Dim {
+@objc public class Dim: NSObject {
-    public init(inDim: [Int]) {
+  private(set) var dims: [Int]
-        dims = inDim
-    }
+  @objc public init(inDim: [Int]) {
+    dims = inDim
-    mutating func swapeDimAt(index1: Int, index2: Int) {
+  }
-        dims.swapAt(index1, index2)
-    }
+  public func cout() -> Int {
+    return dims.count
-    func cout() -> Int {
+  }
-        return dims.count
-    }
+  public func numel() -> Int {
+    return dims.reduce(1) { $0 * $1 }
-    func numel() -> Int {
+  }
-        return dims.reduce(1) { $0 * $1 }
-    }
+  public static func ==(left: Dim, right: Dim) -> Bool {
+    return left.dims == right.dims;
-    public static func ==(left: Dim, right: Dim) -> Bool {
+  }
-        return left.dims == right.dims;
-    }
+  public static func !=(left: Dim, right: Dim) -> Bool {
+    return left.dims != right.dims;
-    public subscript(index: Int) -> Int {
+  }
-        return dims[index];
-    }
+  public subscript(index: Int) -> Int {
+    return dims[index];
-    private(set) var dims: [Int]
+  }
-    private init(){
-        fatalError()
+  public override var description: String {
-    }
+    return "\(dims)"
-}
+  }
-extension Dim: CustomStringConvertible {
+  func swapeDimAt(index1: Int, index2: Int) {
-    public var description: String {
+    dims.swapAt(index1, index2)
-        return "\(dims)"
+  }
-    }
+  private override init(){
+    fatalError()
+  }
 }
--- a/metal/paddle-mobile/paddle-mobile/framework/Executor.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Executor.swift
@@ -15,19 +15,16 @@
 import Foundation
-let testTo = 81
+let testTo = 5
 var isTest = false
-let computePrecision: ComputePrecision = .Float16
+@objc public class GPUResultHolder: NSObject{
+  @objc public let dim: [Int]
-public class GPUResultHolder {
+  @objc public let capacity: Int
-  public let dim: [Int]
+  @objc public var resultPointer: UnsafeMutablePointer<Float32>?
-  public let capacity: Int
+  @objc public var intermediateResults: [String : [MTLBuffer]]?
-  public var resultPointer: UnsafeMutablePointer<Float32>?
+  public init(inDim: [Int], inPointer: UnsafeMutablePointer<Float32>?, inCapacity: Int, inIntermediateResults: [String : [MTLBuffer]]? = nil) {
-  public var intermediateResults: [String : [Variant]]?
-  public let elapsedTime: Double
-  public init(inDim: [Int], inPointer: UnsafeMutablePointer<Float32>?, inCapacity: Int, inElapsedTime: Double, inIntermediateResults: [String : [Variant]]? = nil) {
    dim = inDim
    capacity = inCapacity
@@ -36,64 +33,34 @@ public class GPUResultHolder {
      resultPointer?.initialize(from: inInPointer, count: inCapacity)
    }
-    elapsedTime = inElapsedTime
    intermediateResults = inIntermediateResults
  }
-}
+  public override var description: String {
-extension GPUResultHolder: CustomDebugStringConvertible, CustomStringConvertible {
-  public var debugDescription: String {
-//    var str = ""
-//    str += "Dim: \(dim) \n value:[ "
-//    if resultArr.count < 20 {
-//      for d in resultArr {
-//        str += " \(d) "
-//      }
-//    } else {
-//      for d in stride(from: 0, to: resultArr.count, by: resultArr.count/20) {
-//        str += " \(resultArr[d]) "
-//      }
-//    }
-//    str += " ]"
-//    return str
    fatalError()
  }
-  public var description: String {
-    return debugDescription
-  }
 }
 public class Executor<P: PrecisionType> {
  var ops: [Runable & InferShaperable] = []
+  var preInputDim: Dim = Dim.init(inDim: [])
  let program: Program
  let device: MTLDevice
  let inflightSemaphore: DispatchSemaphore
  let queue: MTLCommandQueue
-  public init(inDevice:MTLDevice, inQueue: MTLCommandQueue, inProgram: Program) throws {
+  init(inDevice:MTLDevice, inQueue: MTLCommandQueue, inProgram: Program, initContext: InitContext) throws {
-    self.inflightSemaphore = DispatchSemaphore(value: 3)
+    self.inflightSemaphore = DispatchSemaphore(value: 1)
    program = inProgram
    device = inDevice
    queue = inQueue
-//    print("before for ")
-//print(program.scope.vars["fea_pyramid1_mbox_conf_flat.Flatten.output.1.tmp_0"])
    for block in inProgram.programDesc.blocks {
      //block.ops.count
      for i in 0..<block.ops.count {
        let opDesc = block.ops[i]
        do {
-//          print("in for i \(i): ")
+          let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: opDesc, scope: inProgram.scope, initContext: initContext)
-//      print(program.scope.vars["fea_pyramid1_mbox_conf_flat.Flatten.output.1.tmp_0"])
-//
-//          if i == 56 {
-//          print(program.scope.vars["fea_pyramid1_mbox_conf_flat.Flatten.output.1.tmp_0"])
-//
-//          }
-          let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: opDesc, scope: inProgram.scope)
          ops.append(op)
        } catch let error {
          throw error
@@ -102,11 +69,12 @@ public class Executor<P: PrecisionType> {
    }
  }
-  public func predict(input: MTLTexture, dim: [Int], completionHandle: @escaping (GPUResultHolder) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws {
+  public func predict(input: MTLTexture, dim: Dim, completionHandle: @escaping (GPUResultHolder) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws {
+    inflightSemaphore.wait()
    guard let buffer = queue.makeCommandBuffer() else {
      throw PaddleMobileError.predictError(message: "CommandBuffer is nil")
    }
-    inflightSemaphore.wait()
    let resInput: MTLTexture
    if let inPre = preProcessKernle {
@@ -120,8 +88,7 @@ public class Executor<P: PrecisionType> {
      resInput = input
    }
-    let beforeDate = Date.init()
+    let inputTexture = InputTexture.init(inMTLTexture: resInput, inExpectDim: dim)
-    let inputTexture = InputTexture.init(inMTLTexture: resInput, inExpectDim: Dim.init(inDim: dim))
    program.scope.setInput(input: inputTexture)
    //(ops.count - except)
    for i in 0..<(ops.count - except) {
@@ -133,64 +100,45 @@ public class Executor<P: PrecisionType> {
      }
    }
-    var outputTextures: [String : [Variant]]?
+    var outputTextures: [String : [MTLBuffer]]?
    if except > 0 {
      ops[ops.count - except].computeMiddleResult(device: device, buffer: buffer)
      outputTextures = ops[ops.count - except].inputVariant()
    }
    buffer.addCompletedHandler { [weak self] (commandbuffer) in
-//      let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
-//      print(inputArr.strideArray())
-//
-////      print(dim)
-//      writeToLibrary(fileName: "test_image_ssd_ar", array: inputArr)
-//      print(" write done ")
-//      print("write to library done")
-//      return
-//                  print(inputArr)
-//
-//                  let stridableInput: [(index: Int, value: Float)] = input.stridableFloatArray()
-//                  print(stridableInput)
-//
-//                  let _: Flo? = input.logDesc(header: "input: ", stridable: true)
-//      for i in 0..<self!.ops.count {
-//        let op = self!.ops[i]
-//        print(" 第 \(i) 个 op: ")
-//        op.delogOutput()
-//      }
-//      return;
-//      self!.ops[testTo - 2].delogOutput()
-//      self!.ops[testTo - 1].delogOutput()
-//      self!.ops[5].delogOutput()
-//      return
      guard let SSelf = self else {
-//        return
        fatalError()
      }
+      //将输入写进文件
+      /*
+       let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
+       print(dim)
+       writeToLibrary(fileName: "test_image_super", array: inputArr)
+       print(" write done ")
+       return
+       */
+      /*    输出 op 计算结果
+       for op in SSelf.ops {
+       op.delogOutput()
+       }
+       */
-      let afterDate = Date.init()
      var resultHolder: GPUResultHolder
      if except > 0 {
-        resultHolder = GPUResultHolder.init(inDim: [], inPointer: nil, inCapacity: 0, inElapsedTime: afterDate.timeIntervalSince(beforeDate), inIntermediateResults: outputTextures)
+        resultHolder = GPUResultHolder.init(inDim: [], inPointer: nil, inCapacity: 0,  inIntermediateResults: outputTextures)
      } else {
        let outputVar: Variant = SSelf.program.scope.output()!
        let output: FetchHolder = outputVar as! FetchHolder
-//        let beforeToTensorDate = Date.init()
+        resultHolder = GPUResultHolder.init(inDim: output.dim.dims, inPointer: output.result, inCapacity: output.capacity)
-        resultHolder = GPUResultHolder.init(inDim: output.dim, inPointer: output.result, inCapacity: output.capacity, inElapsedTime: afterDate.timeIntervalSince(beforeDate))
-//        let timeToTensor = Date.init().timeIntervalSince(beforeToTensorDate)
-//        print(timeToTensor)
      }
      completionHandle(resultHolder)
      SSelf.inflightSemaphore.signal()
    }
    buffer.commit()
  }

--- a/metal/paddle-mobile/paddle-mobile/framework/Loader.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Loader.swift
@@ -150,6 +150,9 @@ public class Loader<P: PrecisionType> {
      let originProgramDesc = ProgramDesc.init(protoProgram: protoProgram)
      let programDesc = ProgramOptimize<P>.init().optimize(originProgramDesc: originProgramDesc)
+//      let programDesc = ProgramDesc.init(protoProgram: protoProgram)
      print(programDesc)
      guard programDesc.blocks.count > 0 else {
@@ -210,7 +213,7 @@ public class Loader<P: PrecisionType> {
              scope[varDesc.name] = tensor
            } else {
              let dim = Dim.init(inDim: tensorDesc.dims)
-              scope[varDesc.name] = Texture<P>.init(device: device, inDim: dim)
+              scope[varDesc.name] = Texture.init(device: device, inDim: dim)
            }
          } else {
            if varDesc.name == fetchKey {

--- a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
@@ -28,9 +28,7 @@ extension Tensorial {
  }
 }
-public enum ComputePrecision {
-  case Float32, Float16
-}
 class Tensor<P: PrecisionType>: Tensorial {
@@ -97,7 +95,7 @@ class Tensor<P: PrecisionType>: Tensorial {
-  func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16, convertToNHWC: Bool = true, withTranspose: Bool = false) {
+  func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16, padWhenOneC: Bool = false, convertToNHWC: Bool = true, withTranspose: Bool = false) {
    if convertToNHWC {
 //      print(layout)
      convert(to: DataLayout.NHWC())
@@ -145,7 +143,7 @@ class Tensor<P: PrecisionType>: Tensorial {
          case .Float16:
            float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
          }
-        } else if C == 1 {
+        } else if C == 1 && !padWhenOneC {
          buffer = device.makeBuffer(length: numel() * precisionSize)
          switch precision {
          case .Float32:
@@ -238,10 +236,32 @@ class Tensor<P: PrecisionType>: Tensorial {
    data.release()
  }
+  var n: Int {  // Batch dimension of a 4-D tensor; traps (fatalError) for any other rank or for a layout other than NCHW/NHWC.
+    get {
+      if dim.cout() == 4 {
+        if layout == DataLayout.NCHW() {
+          return dim[0]  // N is the leading axis in NCHW
+        } else if layout == DataLayout.NHWC() {
+          return dim[0]  // ...and also the leading axis in NHWC
+        } else {
+          fatalError(" unsupport ")
+        }
+      } else {
+        fatalError()
+      }
+    }
+  }
  var width: Int {
    get {
      if dim.cout() == 4 {
-        return dim[1]
+        if layout == DataLayout.NHWC() {
+          return dim[2]
+        } else if layout == DataLayout.NCHW() {
+          return dim[3]
+        } else {
+          fatalError(" unsupport ")
+        }
      } else {
        fatalError()
      }
@@ -251,7 +271,13 @@ class Tensor<P: PrecisionType>: Tensorial {
  var height: Int {
    get {
      if dim.cout() == 4 {
-        return dim[2]
+        if layout == DataLayout.NHWC() {
+          return dim[1]
+        } else if layout == DataLayout.NCHW() {
+          return dim[2]
+        } else {
+          fatalError(" unsupport ")
+        }
      } else {
        fatalError()
      }
@@ -261,7 +287,13 @@ class Tensor<P: PrecisionType>: Tensorial {
  var channel: Int {
    get {
      if dim.cout() == 4 {
-        return dim[3]
+        if layout == DataLayout.NHWC() {
+          return dim[3]
+        } else if layout == DataLayout.NCHW() {
+          return dim[1]
+        } else {
+          fatalError(" unsupport ")
+        }
      } else {
        fatalError()
      }

--- a/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
@@ -68,16 +68,20 @@ extension InputTexture {
 .height = 1
 .len = 1
 */
+public class Texture: Tensorial {
+  public var dim: Dim
-public class Texture<P: PrecisionType>: Tensorial {
-  var dim: Dim
  public var tensorDim: Dim
+  /// tensor dim pad to four
  public var padToFourDim: Dim
  private var textureDesc: MTLTextureDescriptor!
  public var metalTexture: MTLTexture!
  var transpose: [Int] = [0, 1, 2, 3]
+  func elementCount() -> Int {  // Scalar capacity of the backing Metal texture: width * height * arrayLength * 4.
+    return metalTexture.width * metalTexture.height * metalTexture.arrayLength * 4  // factor 4 presumably = four channels per texel (RGBA formats) — TODO confirm
+  }
  func toTensor() -> [Float32] {
    guard  padToFourDim.cout() == 4 else {
      fatalError("- not support -")
@@ -92,15 +96,15 @@ public class Texture<P: PrecisionType>: Tensorial {
    return metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
  }
-  func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) {
+  public func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) {
    transpose = inTranspose
    for i in 0..<(4 - tensorDim.cout()) {
      if i != inTranspose[i] {
        fatalError()
      }
    }
-    let newDim = transpose.map { padToFourDim[$0] }
+    let newDim = transpose.map { padToFourDim[$0] }
    let newLayout = transpose.map { layout.layoutWithDim[$0] }
    layout = DataLayout.init(newLayout)
@@ -139,7 +143,29 @@ public class Texture<P: PrecisionType>: Tensorial {
    metalTexture = device.makeTexture(descriptor: tmpTextureDes) ?! " texture nil "
  }
+  public func updateDims(inTensorDim: Dim, inDim: Dim) {  // Replaces tensorDim with inTensorDim, and dim/padToFourDim with inDim padded to four dimensions; traps if inDim has more than four.
+    var fourDim: Dim
+    if inDim.cout() == 4 {
+      fourDim = inDim  // already four-dimensional: used as-is
+    } else if inDim.cout() < 4 {
+      var fourDimNum: [Int] = []
+      for _ in 0..<(4 - inDim.cout()) {
+        fourDimNum.append(1)  // missing leading dimensions are filled with 1
+      }
+      fourDimNum.append(contentsOf: inDim.dims)
+      fourDim = Dim.init(inDim: fourDimNum)
+    } else {
+      fatalError(" not support ")
+    }
+    tensorDim = inTensorDim
+    dim = fourDim
+    padToFourDim = fourDim  // dim and padToFourDim end up holding the same padded value
+  }
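+  // At initialization, dim and padToFourDim hold the model's dimensions (typically NCHW), padded at the front to four dims (this note says padded with 0; updateDims pads with 1 — TODO confirm)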
  init(device: MTLDevice, inDim: Dim) {
+    print(" in dim > \(inDim)")
    var fourDim: Dim
    if inDim.cout() == 4 {
      fourDim = inDim

--- a/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift
@@ -27,19 +27,19 @@ class OpCreator<P: PrecisionType> {
        }
    }
-    func creat(device: MTLDevice, opDesc: OpDesc, scope: Scope) throws -> Runable & InferShaperable {
+  func creat(device: MTLDevice, opDesc: OpDesc, scope: Scope, initContext: InitContext) throws -> Runable & InferShaperable {
        guard let opCreator = opCreators[opDesc.type] else {
            throw PaddleMobileError.opError(message: "there is no " + opDesc.type + " yet")
        }
        do {
-            return try opCreator(device, opDesc, scope)
+            return try opCreator(device, opDesc, scope, initContext)
        } catch let error {
            throw error
        }
    }
-    let opCreators: [String : (MTLDevice, OpDesc, Scope) throws -> Runable & InferShaperable] =
+    let opCreators: [String : (MTLDevice, OpDesc, Scope, InitContext) throws -> Runable & InferShaperable] =
        [gConvType                  :     ConvOp<P>.creat,
         gBatchNormType             :     BatchNormOp<P>.creat,
         gReluType                  :     ReluOp<P>.creat,

--- a/metal/paddle-mobile/paddle-mobile/Operators/Base/OpParam.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/OpParam.swift
--- a/metal/paddle-mobile/paddle-mobile/Operators/Base/Operator.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/Operator.swift
@@ -31,7 +31,7 @@ protocol Runable {
  func run(device: MTLDevice, buffer: MTLCommandBuffer) throws
  func runImpl(device: MTLDevice,buffer: MTLCommandBuffer) throws
  func delogOutput()
-  func inputVariant() -> [String : [Variant]]
+  func inputVariant() -> [String : [MTLBuffer]]
  func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer)
 }
@@ -44,7 +44,7 @@ extension Runable where Self: OperatorProtocol{
    }
  }
-  func inputVariant() -> [String : [Variant]] {
+  func inputVariant() -> [String : [MTLBuffer]] {
 //    return [:]
    fatalError(" op \(type) need implement inputVariant")
  }
@@ -59,15 +59,26 @@ extension Runable where Self: OperatorProtocol{
  }
 }
+public class InitContext {  // Initialization settings passed down to ops and kernels; carries how and from where the Metal library is loaded.
+  /// How the Metal code is loaded.
+  var metalLoadMode: MetalLoadMode = .LoadMetalInDefaultLib
+  /// Path of the Metal library; must not be nil when metalLoadMode is LoadMetalInCustomMetalLib.
+  var metalLibPath: String? = nil
+  init() {  // Assigns the same values as the property defaults above.
+    metalLoadMode = .LoadMetalInDefaultLib
+    metalLibPath = nil
+  }
+}
 protocol Creator where Self: OperatorProtocol{
  associatedtype OpType: OperatorProtocol & Runable & InferShaperable
-  static func creat(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws -> OpType
+  static func creat(device: MTLDevice, opDesc: OpDesc, inScope: Scope, initContext: InitContext) throws -> OpType
 }
 extension Creator where Self: OperatorProtocol {
-  static func creat(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws -> OpType {
+  static func creat(device: MTLDevice, opDesc: OpDesc, inScope: Scope, initContext: InitContext) throws -> OpType {
    do {
-      return try OpType.provide(device:device, opDesc: opDesc, inScope: inScope)
+      return try OpType.provide(device:device, opDesc: opDesc, inScope: inScope, initContext: initContext)
    } catch let error {
      throw error
    }
@@ -89,13 +100,13 @@ protocol OperatorProtocol {
  var attrs: [String : Attr] { get }
  var para: ParamType { get }
  var kernel: KerType { get }
-  init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws
+  init(device: MTLDevice, opDesc: OpDesc, inScope: Scope, initContext: InitContext) throws
 }
 extension OperatorProtocol {
-  static func provide(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws -> Self {
+  static func provide(device: MTLDevice, opDesc: OpDesc, inScope: Scope, initContext: InitContext) throws -> Self {
    do {
-      return try Self.init(device: device, opDesc: opDesc, inScope: inScope)
+      return try Self.init(device: device, opDesc: opDesc, inScope: inScope, initContext: initContext)
    } catch let error {
      throw error
    }
@@ -103,18 +114,7 @@ extension OperatorProtocol {
 }
 class Operator <KernelType:  Computable , ParameterType>: OperatorProtocol where KernelType.ParamType == ParameterType {
-  typealias ParamType = ParameterType
+  required init(device: MTLDevice, opDesc: OpDesc, inScope: Scope, initContext: InitContext) throws {
-  typealias KerType = KernelType
-  let type: String
-  let inputs: [String : [String]]
-  var paraInputs: [String : [String]]
-  let outpus: [String : [String]]
-  let attrs: [String : Attr]
-  let para: ParamType
-  let scope: Scope
-  var kernel: KerType
-  required init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws {
-//    print("create op: \(opDesc.type)")
    type = opDesc.type
    scope = inScope
    inputs = opDesc.inputs
@@ -126,8 +126,19 @@ class Operator <KernelType:  Computable , ParameterType>: OperatorProtocol where
    } catch let error {
      throw error
    }
-    kernel = KernelType.init(device: device, param: para)
+    kernel = KernelType.init(device: device, param: para, initContext: initContext)
  }
+  typealias ParamType = ParameterType
+  typealias KerType = KernelType
+  let type: String
+  let inputs: [String : [String]]
+  var paraInputs: [String : [String]]
+  let outpus: [String : [String]]
+  let attrs: [String : Attr]
+  let para: ParamType
+  let scope: Scope
+  var kernel: KerType
 }
 // op infos

--- a/metal/paddle-mobile/paddle-mobile/Operators/BatchNormOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/BatchNormOp.swift
@@ -34,8 +34,8 @@ class BatchNormParam<P: PrecisionType>: OpParam {
      throw error
    }
  }
-  let input: Texture<P>
+  let input: Texture
-  var output: Texture<P>
+  var output: Texture
  let bias: Tensor<P>
  let mean: Tensor<P>
  let scale: Tensor<P>

--- a/metal/paddle-mobile/paddle-mobile/Operators/BilinearInterpOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/BilinearInterpOp.swift
@@ -30,8 +30,8 @@ class BilinearInterpParam<P: PrecisionType>: OpParam {
      fatalError()
    }
  }
-  let input: Texture<P>
+  let input: Texture
-  var output: Texture<P>
+  var output: Texture
  let out_h: Int
  let out_w: Int
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift
@@ -37,10 +37,10 @@ class BoxcoderParam<P: PrecisionType>: OpParam {
    assert(codeType == "decode_center_size") // encode_center_size is not implemented
    assert((targetBox.tensorDim.cout() == 3) && (targetBox.tensorDim[0] == 1)) // N must be 1 (only handle batch size = 1)
  }
-  let priorBox: Texture<P>
+  let priorBox: Texture
-  let priorBoxVar: Texture<P>
+  let priorBoxVar: Texture
-  let targetBox: Texture<P>
+  let targetBox: Texture
-  var output: Texture<P>
+  var output: Texture
  let codeType: String
  let boxNormalized: Bool
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/CNNMPSConvOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/CNNMPSConvOp.swift
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
@@ -22,7 +22,7 @@ class ConcatParam<P: PrecisionType>: OpParam {
        fatalError()
      }
      for x in xlist {
-        guard let variant = inScope[x], let v = variant as? Texture<P> else {
+        guard let variant = inScope[x], let v = variant as? Texture else {
          fatalError()
        }
        if transpose.count == 0 {
@@ -40,8 +40,8 @@ class ConcatParam<P: PrecisionType>: OpParam {
      throw error
    }
  }
-  var input: [Texture<P>] = []
+  var input: [Texture] = []
-  var output: Texture<P>
+  var output: Texture
  var transpose: [Int] = []
  let axis: Int
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddAddPreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddAddPreluOp.swift
@@ -34,12 +34,12 @@ class ConvAddAddPreluParam<P: PrecisionType>: OpParam {
    }
  }
-  let input: Texture<P>
+  let input: Texture
  let y: Tensor<P>
  let filter: Tensor<P>
  let mode: String
  let alpha: Tensor<P>
-  var output: Texture<P>
+  var output: Texture
  let stride: [Int32]
  let paddings: [Int32]
  let dilations: [Int32]

--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
@@ -40,7 +40,7 @@ class ConvAddBatchNormReluParam<P: PrecisionType>: OpParam {
    }
  }
-  let input: Texture<P>
+  let input: Texture
  let variance: Tensor<P>
  let bias: Tensor<P>
@@ -52,7 +52,7 @@ class ConvAddBatchNormReluParam<P: PrecisionType>: OpParam {
  var newScale: MTLBuffer?
  var newBiase: MTLBuffer?
-  var output: Texture<P>
+  var output: Texture
  let stride: [Int32]
  let paddings: [Int32]
  let dilations: [Int32]

--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
@@ -32,11 +32,11 @@ class ConvAddParam<P: PrecisionType>: OpParam {
    }
  }
-  let input: Texture<P>
+  let input: Texture
  let y: Tensor<P>
  let filter: Tensor<P>
-  var output: Texture<P>
+  var output: Texture
  let stride: [Int32]
  let paddings: [Int32]
  let dilations: [Int32]
@@ -111,6 +111,7 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>,
 //    print(biase)
    print(" \(type) output: ")
+    print(para.output.metalTexture)
    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
  }
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddPreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddPreluOp.swift
@@ -33,12 +33,12 @@ class ConvAddPreluParam<P: PrecisionType>: OpParam {
    }
  }
-  let input: Texture<P>
+  let input: Texture
  let y: Tensor<P>
  let filter: Tensor<P>
  let mode: String
  let alpha: Tensor<P>
-  var output: Texture<P>
+  var output: Texture
  let stride: [Int32]
  let paddings: [Int32]
  let dilations: [Int32]

--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift
@@ -36,8 +36,7 @@ class ConvBNReluParam<P: PrecisionType>: OpParam {
    }
  }
-  let input: Texture<P>
+  let input: Texture
  let variance: Tensor<P>
  let bias: Tensor<P>
  let mean: Tensor<P>
@@ -47,7 +46,7 @@ class ConvBNReluParam<P: PrecisionType>: OpParam {
  var newScale: MTLBuffer?
  var newBiase: MTLBuffer?
-  var output: Texture<P>
+  var output: Texture
  let stride: [Int32]
  let paddings: [Int32]
  let dilations: [Int32]

--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvOp.swift
@@ -31,9 +31,9 @@ class ConvParam<P: PrecisionType>: OpParam {
    }
  }
-  let input: Texture<P>
+  let input: Texture
  let filter: Tensor<P>
-  var output: Texture<P>
+  var output: Texture
  let stride: [Int32]
  let paddings: [Int32]
  let dilations: [Int32]

--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
--- a/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift
@@ -17,14 +17,6 @@ import Foundation
 class DepthConvOp<P: PrecisionType>: Operator<ConvKernel<P>, ConvParam<P>>, Runable, Creator, InferShaperable {
  typealias OpType = DepthConvOp<P>
-  required init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws {
-    do {
-      try super.init(device: device, opDesc: opDesc, inScope: inScope)
-    } catch let error {
-      throw error
-    }
-  }
  func inferShape() {
    let inDims = para.input.dim

--- a/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift
--- a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
@@ -32,7 +32,7 @@ class ElementwiseAddParam<P: PrecisionType>: OpParam {
      let device = inputX.metalTexture!.device
      inputY = Texture.init(device: device, inDim: tensorY.dim)
      let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel()))
-      inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: computePrecision)
+      inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision)
    }
 //    required init(device: MTLDevice, param: ElementwiseAddParam<P>) {
@@ -55,9 +55,9 @@ class ElementwiseAddParam<P: PrecisionType>: OpParam {
    }
  }
-  var inputX: Texture<P>
+  var inputX: Texture
-  var inputY: Texture<P>
+  var inputY: Texture
-  var output: Texture<P>
+  var output: Texture
  var axis: Int
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddPreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddPreluOp.swift
@@ -34,7 +34,7 @@ class ElementwiseAddPreluParam<P: PrecisionType>: OpParam {
      let device = inputX.metalTexture!.device
      inputY = Texture.init(device: device, inDim: tensorY.dim)
      let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel()))
-      inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: computePrecision)
+      inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision)
    }
    //    required init(device: MTLDevice, param: ElementwiseAddParam<P>) {
@@ -59,9 +59,9 @@ class ElementwiseAddPreluParam<P: PrecisionType>: OpParam {
  let mode: String
  let alpha: Tensor<P>
-  var inputX: Texture<P>
+  var inputX: Texture
-  var inputY: Texture<P>
+  var inputY: Texture
-  var output: Texture<P>
+  var output: Texture
  var axis: Int
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift
@@ -17,7 +17,7 @@ import MetalKit
 import CoreMedia
 class FeedParam<P: PrecisionType>: OpParam{
-  var output: Texture<P>
+  var output: Texture
  var input: InputTexture {
    return scope.input() as! InputTexture
  }
@@ -63,7 +63,8 @@ class FeedOp<P: PrecisionType>: Operator<Texture2DTo2DArrayKernel<P>, FeedParam<
  func delogOutput() {
    print(" \(type) output: ")
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
+    print(para.output.metalTexture)
+    print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[3], h: para.output.padToFourDim[2], w: para.output.padToFourDim[1])).strideArray())
  }
 }
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+import Foundation
+import Metal
+class FetchParam<P: PrecisionType>: OpParam{  // Parameters of the fetch op: the input texture, the FetchHolder built for it, and the owning scope.
+  var output: FetchHolder
+  let input: Texture
+  let scope: Scope
+  required init(opDesc: OpDesc, inScope: Scope) throws {  // Rethrows unchanged any error raised inside the do block.
+    scope = inScope
+    do {
+      input = try FetchParam.inputX(inputs: opDesc.inputs, from: inScope)
+      output = FetchHolder.init(inPaddedCapacity: input.elementCount(), inDim: input.tensorDim)  // holder sized from the input texture's element count, shaped by its tensorDim
+      scope.setOutput(output: output)  // registers the holder as the scope's output
+    } catch let error {
+      throw error
+    }
+  }
+  //typealias ParamPrecisionType = P
+}
+class FetchOp<P: PrecisionType>: Operator< FetchKernel<P>, FetchParam<P>>, Runable, Creator, InferShaperable {  // Fetch operator: runs FetchKernel over FetchParam.
+  typealias OpType = FetchOp<P>
+  func inferShape() {  // Only prints the input dim; no shape is computed or written here.
+    print(para.input.dim)
+  }
+  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {  // Delegates to the kernel's compute on the given command buffer; errors are rethrown unchanged.
+    do {
+      try kernel.compute(commandBuffer: buffer, param: para)
+    } catch let error {
+      throw error
+    }
+  }
+  func delogOutput() {  // Debug helper: prints the holder's result as floats (capacity elements), passed through strideArray().
+    print("fetch output: ")
+    let resArr = self.para.output.result.floatArr(count: self.para.output.capacity)
+    print(resArr.strideArray())
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/FlattenOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/FlattenOp.swift
@@ -25,8 +25,8 @@ class FlattenParam<P: PrecisionType>: OpParam {
      throw error
    }
  }
-  let input: Texture<P>
+  let input: Texture
-  var output: Texture<P>
+  var output: Texture
  let axis: Int
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift
@@ -21,14 +21,14 @@ public protocol TestParam {
 public protocol Testable {
  associatedtype TestParamType: TestParam
  func test(commandBuffer: MTLCommandBuffer, param: TestParamType)
-  init(device: MTLDevice, testParam: TestParamType)
+  init(device: MTLDevice, testParam: TestParamType, initContext: InitContext)
 }
 protocol Computable {
  associatedtype ParamType: OpParam
  func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws
-  init(device: MTLDevice, param: ParamType)
+  init(device: MTLDevice, param: ParamType, initContext: InitContext)
 }
 protocol KernelProtocol {
@@ -37,37 +37,83 @@ protocol KernelProtocol {
 }
-open class Kernel {
+@objc open class Kernel: NSObject{  // Base kernel: holds the compute pipeline state built for one named Metal function.
  let pipline: MTLComputePipelineState
  let functionName: String
-  public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = true) {
+  public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = false, initContext: InitContext) {  // usePaddleMobileLib is not read by the new body; library selection comes from initContext.
-    pipline = device.pipeLine(funcName: inFunctionName, inPaddleMobileLib: usePaddleMobileLib)
+    pipline = device.pipeLine(funcName: inFunctionName, metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath)  // pipeline built using the context's load mode and library path
    functionName = inFunctionName
  }
 }
-open class CusomKernel: Kernel {
+@objc public class Shape: NSObject {  // Width/height/channel triple used as a kernel's output dimensions; now a top-level @objc NSObject subclass (previously a struct nested in CusomKernel).
-  public struct Shape {
+  public let width: Int
-    public let width: Int
+  public let height: Int
-    public let height: Int
+  public let channel: Int
-    public let channel: Int
+  @objc public init(inWidth: Int, inHeight: Int, inChannel: Int){  // Stores the three values unchanged; no validation is performed.
-    public init(inWidth: Int, inHeight: Int, inChannel: Int){
+    width = inWidth
-      width = inWidth
+    height = inHeight
-      height = inHeight
+    channel = inChannel
-      channel = inChannel
+  }
+}
+open class BufferToTextureKernel: Kernel {  // Kernel that encodes a buffer-to-texture pass, writing into an output texture it allocates at init.
+  public let outputTexture: MTLTexture
+  public init(device: MTLDevice, outputDim: Shape, metalLoadMode: MetalLoadMode, metalLibPath: String?) {  // Allocates outputTexture from outputDim, then selects the float or half kernel function by global compute precision.
+    let textureDesc = MTLTextureDescriptor.init()
+    textureDesc.textureType = .type2D
+    textureDesc.width = outputDim.width
+    textureDesc.height = outputDim.height
+    textureDesc.depth = (outputDim.channel + 3) / 4  // channel count divided by 4, rounded up; NOTE(review): set on a .type2D descriptor — confirm depth (rather than arrayLength) is intended
+    if GlobalConfig.shared.computePrecision == .Float16 {
+      textureDesc.pixelFormat = .rgba16Float
+    } else if GlobalConfig.shared.computePrecision == .Float32 {
+      textureDesc.pixelFormat = .rgba32Float
+    } else {
+      fatalError()
    }
+    textureDesc.usage = [.shaderRead, .shaderWrite]
+    textureDesc.storageMode = .shared
+    outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error "
+    let initContext = InitContext.init()  // a fresh context carrying the caller's load mode and library path
+    initContext.metalLibPath = metalLibPath
+    initContext.metalLoadMode = metalLoadMode
+    if GlobalConfig.shared.computePrecision == .Float32 {
+      super.init(device: device, inFunctionName: "buffer_to_texture_kernel", initContext: initContext)
+    } else {
+      super.init(device: device, inFunctionName: "buffer_to_texture_kernel_half", initContext: initContext)  // every non-Float32 precision falls through to the half variant
+    }
+  }
+  public func compute(inputBuffer: MTLBuffer , commandBuffer: MTLCommandBuffer) throws {  // Encodes one compute pass from inputBuffer into outputTexture; throws predictError if no encoder can be created.
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+      throw PaddleMobileError.predictError(message: " encode is nil")
+    }
+    encoder.setBuffer(inputBuffer, offset: 0, index: 0)
+    encoder.setTexture(outputTexture, index: 0)
+    encoder.dispatch(computePipline: pipline, outTexture: outputTexture)
+    encoder.endEncoding()
  }
+}
+@objc open class CusomKernel: Kernel {
  public let outputTexture: MTLTexture
-  public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, usePaddleMobileLib: Bool = false) {
+  public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, metalLoadModel: MetalLoadMode, metalLibPath: String?) {
    let textureDesc = MTLTextureDescriptor.init()
    textureDesc.textureType = .type2D
    textureDesc.width = outputDim.width
    textureDesc.height = outputDim.height
    textureDesc.depth = (outputDim.channel + 3) / 4
-    if computePrecision == .Float16 {
+    if GlobalConfig.shared.computePrecision == .Float16 {
      textureDesc.pixelFormat = .rgba16Float
-    } else if computePrecision == .Float32 {
+    } else if GlobalConfig.shared.computePrecision == .Float32 {
      textureDesc.pixelFormat = .rgba32Float
    } else {
      fatalError()
@@ -77,7 +123,10 @@ open class CusomKernel: Kernel {
    textureDesc.storageMode = .shared
    outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error "
-    super.init(device: device, inFunctionName: inFunctionName, usePaddleMobileLib: usePaddleMobileLib)
+    let context = InitContext.init()
+    context.metalLoadMode = metalLoadModel
+    context.metalLibPath = metalLibPath
+    super.init(device: device, inFunctionName: inFunctionName, initContext: context)
  }
  public func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws {

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift
@@ -15,7 +15,7 @@
 import Foundation
 class BatchNormKernel<P: PrecisionType>: Kernel, Computable {
-  required init(device: MTLDevice, param: BatchNormParam<P>) {
+  required init(device: MTLDevice, param: BatchNormParam<P>, initContext: InitContext) {
    let count = param.variance.dim.numel()
    let varianceP = param.variance.data.pointer
    let meanP = param.mean.data.pointer
@@ -27,13 +27,13 @@ class BatchNormKernel<P: PrecisionType>: Kernel, Computable {
      scaleP[i] = invStd * scaleP[i]
    }
-    param.bias.initBuffer(device: device, precision: computePrecision)
+    param.bias.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.scale.initBuffer(device: device, precision: computePrecision)
+    param.scale.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "batchnorm")
+      super.init(device: device, inFunctionName: "batchnorm", initContext: initContext)
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "batchnorm_half")
+      super.init(device: device, inFunctionName: "batchnorm_half", initContext: initContext)
    } else {
      fatalError()
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormReluKernel.swift
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BilinearInterpKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BilinearInterpKernel.swift
@@ -41,12 +41,12 @@ class BilinearInterpKernel<P: PrecisionType>: Kernel, Computable{
    encoder.endEncoding()
  }
-  required init(device: MTLDevice, param: BilinearInterpParam<P>) {
+  required init(device: MTLDevice, param: BilinearInterpParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "bilinear_interp_float")
+      super.init(device: device, inFunctionName: "bilinear_interp_float", initContext: initContext)
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "bilinear_interp_half")
+      super.init(device: device, inFunctionName: "bilinear_interp_half", initContext: initContext)
    } else {
      fatalError()
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift
@@ -32,12 +32,12 @@ class BoxcoderKernel<P: PrecisionType>: Kernel, Computable{
    encoder.endEncoding()
  }
-  required init(device: MTLDevice, param: BoxcoderParam<P>) {
+  required init(device: MTLDevice, param: BoxcoderParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 3, 1, 2], computePrecision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: [0, 3, 1, 2], computePrecision: GlobalConfig.shared.computePrecision)
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "boxcoder_float")
+      super.init(device: device, inFunctionName: "boxcoder_float", initContext: initContext)
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "boxcoder_half")
+      super.init(device: device, inFunctionName: "boxcoder_half", initContext: initContext)
    } else {
      fatalError()
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/CNNConvKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/CNNConvKernel.swift
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Concat.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Concat.swift
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift
@@ -52,8 +52,8 @@ class ConcatKernel<P: PrecisionType>: Kernel, Computable{
    encoder.endEncoding()
  }
-  required init(device: MTLDevice, param: ConcatParam<P>) {
+  required init(device: MTLDevice, param: ConcatParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: GlobalConfig.shared.computePrecision)
    let orank = param.output.tensorDim.cout()
    let num = param.input.count
    assert(num <= 6)
@@ -133,16 +133,16 @@ class ConcatKernel<P: PrecisionType>: Kernel, Computable{
      }
    }
    pm.vdim = (Int32(vdim[0]), Int32(vdim[1]), Int32(vdim[2]), Int32(vdim[3]), Int32(vdim[4]), Int32(vdim[5]))
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_float")
+      super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_float", initContext: initContext)
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_half")
+      super.init(device: device, inFunctionName: "concat_\(orank)_\(num)_\(v)_half", initContext: initContext)
    } else {
      fatalError()
    }
  }
-  required init(device: MTLDevice, testParam: ConcatTestParam) {
+  required init(device: MTLDevice, testParam: ConcatTestParam, initContext: InitContext) {
-    super.init(device: device, inFunctionName: "concat")
+    super.init(device: device, inFunctionName: "concat", initContext: initContext)
  }
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddAddPreluKernel.swift
@@ -16,99 +16,99 @@ import Foundation
 class ConvAddAddPreluKernel<P: PrecisionType>: Kernel, Computable {
  var metalParam: MetalConvParam!
-  required init(device: MTLDevice, param: ConvAddAddPreluParam<P>) {
+  required init(device: MTLDevice, param: ConvAddAddPreluParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-    param.filter.initBuffer(device: device, precision: computePrecision)
+    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.y.initBuffer(device: device, precision: computePrecision)
+    param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.alpha.initBuffer(device: device, precision: computePrecision)
+    param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    if computePrecision == .Float16 {
+    if GlobalConfig.shared.computePrecision == .Float16 {
      if param.filter.width == 1 && param.filter.height == 1 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half")
+          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half")
+          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half")
+          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half", initContext: initContext)
        }
      } else if param.filter.channel == 1 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half")
+          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half")
+          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half")
+          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half", initContext: initContext)
        }
      } else if param.filter.width == 3 && param.filter.height == 3 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half")
+          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half")
+          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half")
+          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half", initContext: initContext)
        }
      } else if param.filter.width == 1 && param.filter.height == 5 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half")
+          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half")
+          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half")
+          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half", initContext: initContext)
        }
      } else if param.filter.width == 5 && param.filter.height == 1 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half")
+          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half")
+          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half")
+          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half", initContext: initContext)
        }
      } else {
        fatalError(" unsupport yet ")
      }
-    } else if computePrecision == .Float32 {
+    } else if GlobalConfig.shared.computePrecision == .Float32 {
      if param.filter.width == 1 && param.filter.height == 1 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float")
+          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float")
+          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float")
+          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float", initContext: initContext)
        }
      } else if param.filter.channel == 1 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float")
+          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float")
+          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float")
+          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float", initContext: initContext)
        }
      } else if param.filter.width == 3 && param.filter.height == 3 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float")
+          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float")
+          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float")
+          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float", initContext: initContext)
        }
      } else if param.filter.width == 1 && param.filter.height == 5 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float")
+          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float")
+          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float")
+          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float", initContext: initContext)
        }
      } else if param.filter.width == 5 && param.filter.height == 1 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float")
+          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float")
+          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float")
+          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float", initContext: initContext)
        }
      } else {
        fatalError(" unsupport yet ")

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
@@ -37,44 +37,44 @@ struct ConvAddBatchNormReluTestParam: TestParam {
 }
 class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
-  required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam) {
+  required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam, initContext: InitContext) {
    if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 {
-      super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
+      super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1", initContext: initContext)
    } else if testParam.filterSize.channel == 1 {
-      super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
+      super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3", initContext: initContext)
    } else {
-      super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
+      super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3", initContext: initContext)
    }
  }
  var metalParam: MetalConvParam!
-  required init(device: MTLDevice, param: ConvAddBatchNormReluParam<P>) {
+  required init(device: MTLDevice, param: ConvAddBatchNormReluParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-    param.filter.initBuffer(device: device, precision: computePrecision)
+    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.y.initBuffer(device: device, precision: computePrecision)
+    param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
    param.variance.initBuffer(device: device, precision: .Float32)
    param.mean.initBuffer(device: device, precision: .Float32)
    param.scale.initBuffer(device: device, precision: .Float32)
    param.bias.initBuffer(device: device, precision: .Float32)
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
      if param.filter.width == 1 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
+        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1", initContext: initContext)
      } else if param.filter.channel == 1 {
-        super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
+        super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3", initContext: initContext)
      } else if param.filter.width == 3 && param.filter.height == 3 {
-        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
+        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3", initContext: initContext)
      } else {
        fatalError(" unsupport ")
      }
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
      if param.filter.width == 1 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1_half")
+        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1_half", initContext: initContext)
      } else if param.filter.channel == 1 {
-        super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3_half")
+        super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3_half", initContext: initContext)
      } else if param.filter.width == 3 && param.filter.height == 3 {
-        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3_half")
+        super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3_half", initContext: initContext)
      } else {
        fatalError(" unsupport ")
      }
@@ -120,10 +120,10 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
    var newBiaseBuffer: MTLBuffer
    var newScaleBuffer: MTLBuffer
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
      newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)!
      newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)!
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
      newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
      newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
@@ -16,36 +16,37 @@ import Foundation
 class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
  var metalParam: MetalConvParam!
-  required init(device: MTLDevice, param: ConvAddParam<P>) {
+  required init(device: MTLDevice, param: ConvAddParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-    param.filter.initBuffer(device: device, precision: computePrecision)
+    let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1])
-    param.y.initBuffer(device: device, precision: computePrecision)
+    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC)
+    param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    if computePrecision == .Float16 {
+    if GlobalConfig.shared.computePrecision == .Float16 {
      if param.filter.width == 1 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_add_1x1_half")
+        super.init(device: device, inFunctionName: "conv_add_1x1_half", initContext: initContext)
-      } else if param.filter.channel == 1 {
+      } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
-        super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half")
+        super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half", initContext: initContext)
      } else if param.filter.width == 3 && param.filter.height == 3 {
-        super.init(device: device, inFunctionName: "conv_add_3x3_half")
+        super.init(device: device, inFunctionName: "conv_add_3x3_half", initContext: initContext)
      } else if param.filter.width == 1 && param.filter.height == 5 {
-        super.init(device: device, inFunctionName: "conv_add_5x1_half")
+        super.init(device: device, inFunctionName: "conv_add_5x1_half", initContext: initContext)
      } else if param.filter.width == 5 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_add_1x5_half")
+        super.init(device: device, inFunctionName: "conv_add_1x5_half", initContext: initContext)
      } else {
        fatalError(" unsupport yet ")
      }
-    } else if computePrecision == .Float32 {
+    } else if GlobalConfig.shared.computePrecision == .Float32 {
      if param.filter.width == 1 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_add_1x1")
+        super.init(device: device, inFunctionName: "conv_add_1x1", initContext: initContext)
-      } else if param.filter.channel == 1 {
+      } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
-        super.init(device: device, inFunctionName: "depthwise_conv_add_3x3")
+        super.init(device: device, inFunctionName: "depthwise_conv_add_3x3", initContext: initContext)
      } else if param.filter.width == 1 && param.filter.height == 5 {
-        super.init(device: device, inFunctionName: "conv_add_5x1")
+        super.init(device: device, inFunctionName: "conv_add_5x1", initContext: initContext)
      } else if param.filter.width == 5 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_add_1x5")
+        super.init(device: device, inFunctionName: "conv_add_1x5", initContext: initContext)
      } else if param.filter.width == 3 && param.filter.height == 3 {
-        super.init(device: device, inFunctionName: "conv_add_3x3")
+        super.init(device: device, inFunctionName: "conv_add_3x3", initContext: initContext)
      } else {
        fatalError(" unsupport yet ")
      }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddPreluKernel.swift
@@ -16,99 +16,99 @@ import Foundation
 class ConvAddPreluKernel<P: PrecisionType>: Kernel, Computable {
  var metalParam: MetalConvParam!
-  required init(device: MTLDevice, param: ConvAddPreluParam<P>) {
+  required init(device: MTLDevice, param: ConvAddPreluParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-    param.filter.initBuffer(device: device, precision: computePrecision)
+    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.y.initBuffer(device: device, precision: computePrecision)
+    param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.alpha.initBuffer(device: device, precision: computePrecision)
+    param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    if computePrecision == .Float16 {
+    if GlobalConfig.shared.computePrecision == .Float16 {
      if param.filter.width == 1 && param.filter.height == 1 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half")
+          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half")
+          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half")
+          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half", initContext: initContext)
        }
      } else if param.filter.channel == 1 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half")
+          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half")
+          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half")
+          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half", initContext: initContext)
        }
      } else if param.filter.width == 3 && param.filter.height == 3 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half")
+          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half")
+          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half")
+          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half", initContext: initContext)
        }
      } else if param.filter.width == 1 && param.filter.height == 5 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half")
+          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half")
+          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half")
+          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half", initContext: initContext)
        }
      } else if param.filter.width == 5 && param.filter.height == 1 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half")
+          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half")
+          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half")
+          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half", initContext: initContext)
        }
      } else {
        fatalError(" unsupport yet ")
      }
-    } else if computePrecision == .Float32 {
+    } else if GlobalConfig.shared.computePrecision == .Float32 {
      if param.filter.width == 1 && param.filter.height == 1 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float")
+          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float")
+          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float")
+          super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float", initContext: initContext)
        }
      } else if param.filter.channel == 1 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float")
+          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float")
+          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float")
+          super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float", initContext: initContext)
        }
      } else if param.filter.width == 3 && param.filter.height == 3 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float")
+          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float")
+          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float")
+          super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float", initContext: initContext)
        }
      } else if param.filter.width == 1 && param.filter.height == 5 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float")
+          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float")
+          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float")
+          super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float", initContext: initContext)
        }
      } else if param.filter.width == 5 && param.filter.height == 1 {
        if param.mode == "channel" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float")
+          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float", initContext: initContext)
        } else if param.mode == "element" {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float")
+          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float", initContext: initContext)
        } else {
-          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float")
+          super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float", initContext: initContext)
        }
      } else {
        fatalError(" unsupport yet ")

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
@@ -38,44 +38,44 @@ struct ConvBNReluTestParam: TestParam {
 }
 class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
-  required init(device: MTLDevice, testParam: ConvBNReluTestParam) {
+  required init(device: MTLDevice, testParam: ConvBNReluTestParam, initContext: InitContext) {
    if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 {
-      super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1")
+      super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1", initContext: initContext)
    } else if testParam.filterSize.channel == 1 {
-      super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3")
+      super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3", initContext: initContext)
    } else {
-      super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3")
+      super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3", initContext: initContext)
    }
  }
  var metalParam: MetalConvParam!
-  required init(device: MTLDevice, param: ConvBNReluParam<P>) {
+  required init(device: MTLDevice, param: ConvBNReluParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-    param.filter.initBuffer(device: device, precision: computePrecision)
+    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
    param.variance.initBuffer(device: device, precision: .Float32)
    param.mean.initBuffer(device: device, precision: .Float32)
    param.scale.initBuffer(device: device, precision: .Float32)
    param.bias.initBuffer(device: device, precision: .Float32)
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
      if param.filter.width == 1 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1")
+        super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1", initContext: initContext)
      } else if param.filter.channel == 1 {
-        super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3")
+        super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3", initContext: initContext)
      } else if param.filter.width == 3 && param.filter.height == 3 {
-        super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3")
+        super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3", initContext: initContext)
      } else {
        fatalError(" unsupport ")
      }
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
      if param.filter.width == 1 && param.filter.height == 1 {
-        super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half")
+        super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half", initContext: initContext)
      } else if param.filter.channel == 1 {
-        super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half")
+        super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half", initContext: initContext)
      } else if param.filter.width == 3 && param.filter.height == 3 {
-        super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half")
+        super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half", initContext: initContext)
      } else {
        fatalError(" unsupport ")
      }
@@ -122,10 +122,10 @@ class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
    var newBiaseBuffer: MTLBuffer
    var newScaleBuffer: MTLBuffer
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
      newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)!
      newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)!
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
      newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
      newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
@@ -26,14 +26,14 @@ public struct MetalConvParam {
 class ConvKernel<P: PrecisionType>: Kernel, Computable {
  var metalParam: MetalConvParam!
-  required init(device: MTLDevice, param: ConvParam<P>) {
+  required init(device: MTLDevice, param: ConvParam<P>, initContext: InitContext) {
    param.filter.initBuffer(device: device, precision: ComputePrecision.Float32)
    if param.filter.width == 1 && param.filter.height == 1 {
-      super.init(device: device, inFunctionName: "conv_1x1")
+      super.init(device: device, inFunctionName: "conv_1x1", initContext: initContext)
    } else if param.filter.channel == 1 {
-      super.init(device: device, inFunctionName: "depthwise_conv_3x3")
+      super.init(device: device, inFunctionName: "depthwise_conv_3x3", initContext: initContext)
    } else if param.filter.width == 3 && param.filter.height == 3 {
-      super.init(device: device, inFunctionName: "conv_3x3")
+      super.init(device: device, inFunctionName: "conv_3x3", initContext: initContext)
    } else {
      fatalError(" unsupport ")
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift
@@ -30,18 +30,18 @@ struct MetalConvTransposeParam {
 class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{
  var metalParam: MetalConvTransposeParam!
-  required init(device: MTLDevice, param: ConvTransposeParam<P>) {
+  required init(device: MTLDevice, param: ConvTransposeParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    param.filter.initBuffer(device: device, precision: computePrecision, convertToNHWC: false, withTranspose: true)
+    param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, convertToNHWC: false, withTranspose: true)
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
      if param.stride == [2, 2] && param.stride == [2, 2] {
-        super.init(device: device, inFunctionName: "conv_transpose2x2_stride2")
+        super.init(device: device, inFunctionName: "conv_transpose2x2_stride2", initContext: initContext)
      } else {
        fatalError(" -- conv transpose unsupported yet -- ")
      }
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
      if param.stride == [2, 2] && param.stride == [2, 2] {
-        super.init(device: device, inFunctionName: "conv_transpose2x2_stride2_half")
+        super.init(device: device, inFunctionName: "conv_transpose2x2_stride2_half", initContext: initContext)
      } else {
        fatalError(" -- conv transpose unsupported yet -- ")
      }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift
@@ -26,8 +26,8 @@ struct ElementwiseAddMetalParam {
 class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable {
  var metalParam: ElementwiseAddMetalParam
-  required init(device: MTLDevice, param: ElementwiseAddParam<P>) {
+  required init(device: MTLDevice, param: ElementwiseAddParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: GlobalConfig.shared.computePrecision)
    metalParam = ElementwiseAddMetalParam.init()
@@ -50,10 +50,10 @@ class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable {
      //      print("===> elementwise_add fast!!!")
      metalParam.fast = 1
    }
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "elementwise_add")
+      super.init(device: device, inFunctionName: "elementwise_add", initContext: initContext)
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "elementwise_add_half")
+      super.init(device: device, inFunctionName: "elementwise_add_half", initContext: initContext)
    } else {
      fatalError()
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddPreluKernel.swift
@@ -17,9 +17,9 @@ import Foundation
 class ElementwiseAddPreluKernel<P: PrecisionType>: Kernel, Computable {
  var metalParam: ElementwiseAddMetalParam
-  required init(device: MTLDevice, param: ElementwiseAddPreluParam<P>) {
+  required init(device: MTLDevice, param: ElementwiseAddPreluParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    param.alpha.initBuffer(device: device, precision: computePrecision)
+    param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
    metalParam = ElementwiseAddMetalParam.init()
@@ -43,21 +43,21 @@ class ElementwiseAddPreluKernel<P: PrecisionType>: Kernel, Computable {
      metalParam.fast = 1
    }
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
      if param.mode == "channel" {
-        super.init(device: device, inFunctionName: "elementwise_add_channel_float")
+        super.init(device: device, inFunctionName: "elementwise_add_channel_float", initContext: initContext)
      } else if param.mode == "element" {
-        super.init(device: device, inFunctionName: "elementwise_add_element_float")
+        super.init(device: device, inFunctionName: "elementwise_add_element_float", initContext: initContext)
      } else {
-        super.init(device: device, inFunctionName: "elementwise_add_prelu_float")
+        super.init(device: device, inFunctionName: "elementwise_add_prelu_float", initContext: initContext)
      }
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
      if param.mode == "channel" {
-        super.init(device: device, inFunctionName: "elementwise_add_channel_half")
+        super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext)
      } else if param.mode == "element" {
-        super.init(device: device, inFunctionName: "elementwise_add_channel_half")
+        super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext)
      } else {
-        super.init(device: device, inFunctionName: "elementwise_add_channel_half")
+        super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext)
      }
    } else {
      fatalError()

--- a/metal/paddle-mobile/paddle-mobile/Operators/FetchOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/FetchOp.swift
@@ -13,76 +13,49 @@
 limitations under the License. */
 import Foundation
-import Metal
-class FetchParam<P: PrecisionType>: OpParam{
-  var output: FetchHolder
-  let input: Texture<P>
-  let scope: Scope
-  required init(opDesc: OpDesc, inScope: Scope) throws {
-    scope = inScope
-    do {
-      input = try FetchParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = FetchHolder.init(inCapacity: input.numel(), inDim: input.tensorDim.dims)
-      scope.setOutput(output: output)
-    } catch let error {
-      throw error
-    }
-  }
-  //typealias ParamPrecisionType = P
-}
 class FetchKernel<P: PrecisionType>: Kernel, Computable {
-  func compute(commandBuffer: MTLCommandBuffer, param: FetchParam<P>) throws {
+  required init(device: MTLDevice, param: FetchParam<P>, initContext: InitContext) {
-    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-      throw PaddleMobileError.predictError(message: " encode is nil")
-    }
-    encoder.setTexture(param.input.metalTexture, index: 0)
-    encoder.setBuffer(param.output.resultBuffer!, offset: 0, index: 0)
-    encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture)
-    encoder.endEncoding()
-  }
-  required init(device: MTLDevice, param: FetchParam<P>) {
    param.output.initBuffer(device: device)
-    if computePrecision == .Float16 {
+    if GlobalConfig.shared.computePrecision == .Float16 {
      if param.input.transpose == [0, 2, 3, 1] {
-        super.init(device: device, inFunctionName: "fetch_half")
+        super.init(device: device, inFunctionName: "fetch_half", initContext: initContext)
+      } else if param.input.transpose == [0, 1, 2, 3] {
+        switch param.input.tensorDim.cout() {
+        case 1, 2:
+          super.init(device: device, inFunctionName: "fetch_1or2_half", initContext: initContext)
+        default:
+          fatalError(" not support ")
+        }
      } else {
-//        fatalError(" not support ")
+        fatalError(" not support ")
-        super.init(device: device, inFunctionName: "fetch_placeholder_half")
-        print(" not support ")
      }
-    } else if computePrecision == .Float32 {
+    } else if GlobalConfig.shared.computePrecision == .Float32 {
      if param.input.transpose == [0, 2, 3, 1] {
-        super.init(device: device, inFunctionName: "fetch")
+        super.init(device: device, inFunctionName: "fetch_float", initContext: initContext)
+      } else if param.input.transpose == [0, 1, 2, 3] {
+        switch param.input.tensorDim.cout() {
+        case 1, 2:
+          super.init(device: device, inFunctionName: "fetch_1or2_float", initContext: initContext)
+        default:
+          fatalError(" not support ")
+        }
      } else {
-        print(" not support ")
+        fatalError(" not support ")
-        super.init(device: device, inFunctionName: "fetch_placeholder")
-//        fatalError(" not support ")        
      }
    } else {
      fatalError(" not support ")
    }
  }
-}
-class FetchOp<P: PrecisionType>: Operator< FetchKernel<P>, FetchParam<P>>, Runable, Creator, InferShaperable {
-  typealias OpType = FetchOp<P>
+  func compute(commandBuffer: MTLCommandBuffer, param: FetchParam<P>) throws {
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-  func inferShape() {
+      throw PaddleMobileError.predictError(message: " encode is nil")
-    print(para.input.dim)
-  }
-  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
-    do {
-      try kernel.compute(commandBuffer: buffer, param: para)
-    } catch let error {
-      throw error
    }
+    encoder.setTexture(param.input.metalTexture, index: 0)
+    encoder.setBuffer(param.output.resultBuffer!, offset: 0, index: 0)
+    encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture)
+    encoder.endEncoding()
  }
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/FlattenKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/FlattenKernel.swift
@@ -26,8 +26,8 @@ class FlattenKernel<P: PrecisionType>: Kernel, Computable{
  var metalParam: FlattenMetalParam
-  required init(device: MTLDevice, param: FlattenParam<P>) {
+  required init(device: MTLDevice, param: FlattenParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, computePrecision: computePrecision)
+    param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
    var id: [Int32] = [1, 1, 1, 1]
    for i in 0..<param.input.tensorDim.cout() {
      id[4-param.input.tensorDim.cout()+i] = Int32(param.input.tensorDim[i])
@@ -47,10 +47,10 @@ class FlattenKernel<P: PrecisionType>: Kernel, Computable{
    let irank = param.input.tensorDim.cout()
    let orank = param.output.tensorDim.cout()
    assert(orank == 2)
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "reshape_\(irank)_2_float")
+      super.init(device: device, inFunctionName: "reshape_\(irank)_2_float", initContext: initContext)
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "reshape_\(irank)_2_half")
+      super.init(device: device, inFunctionName: "reshape_\(irank)_2_half", initContext: initContext)
    } else {
      fatalError()
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/MulticlassNMSKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/MulticlassNMSKernel.swift
@@ -17,16 +17,16 @@ import Foundation
 class MulticlassNMSKernel<P: PrecisionType>: Kernel, Computable{
  let pipline1: MTLComputePipelineState
-  required init(device: MTLDevice, param: MulticlassNMSParam<P>) {
+  required init(device: MTLDevice, param: MulticlassNMSParam<P>, initContext: InitContext) {
    param.middleOutput.initBuffer(device: device)
    param.bboxOutput.initBuffer(device: device)
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
-      pipline1 = device.pipeLine(funcName: "nms_fetch_bbox", inPaddleMobileLib: true)
+      pipline1 = device.pipeLine(funcName: "nms_fetch_bbox", metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath)
-      super.init(device: device, inFunctionName: "nms_fetch_result")
+      super.init(device: device, inFunctionName: "nms_fetch_result", initContext: initContext)
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      pipline1 = device.pipeLine(funcName: "nms_fetch_bbox_half", inPaddleMobileLib: true)
+      pipline1 = device.pipeLine(funcName: "nms_fetch_bbox_half", metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath)
-      super.init(device: device, inFunctionName: "nms_fetch_result_half")
+      super.init(device: device, inFunctionName: "nms_fetch_result_half", initContext: initContext)
    } else {
      fatalError( " unsupport precision " )
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift
@@ -26,8 +26,8 @@ struct PoolMetalParam {
 class PoolKernel<P: PrecisionType>: Kernel, Computable{
  var metalParam: PoolMetalParam
-  required init(device: MTLDevice, param: PoolParam<P>) {
+  required init(device: MTLDevice, param: PoolParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
    var poolType: Int32
    switch param.poolType {
@@ -48,10 +48,10 @@ class PoolKernel<P: PrecisionType>: Kernel, Computable{
      poolType: poolType
    )
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "pool")
+      super.init(device: device, inFunctionName: "pool_float", initContext: initContext)
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "pool_half")
+      super.init(device: device, inFunctionName: "pool_half", initContext: initContext)
    } else {
      fatalError()
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift
@@ -15,24 +15,24 @@
 import Foundation
 class PreluKernel<P: PrecisionType>: Kernel, Computable{
-  required init(device: MTLDevice, param: PreluParam<P>) {
+  required init(device: MTLDevice, param: PreluParam<P>, initContext: InitContext) {
-    param.alpha.initBuffer(device: device, precision: computePrecision)
+    param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision)
-    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
      if param.mode == "channel" {
-        super.init(device: device, inFunctionName: "prelu_channel")
+        super.init(device: device, inFunctionName: "prelu_channel", initContext: initContext)
      } else if param.mode == "element" {
-        super.init(device: device, inFunctionName: "prelu_element")
+        super.init(device: device, inFunctionName: "prelu_element", initContext: initContext)
      } else {
-        super.init(device: device, inFunctionName: "prelu_other")
+        super.init(device: device, inFunctionName: "prelu_other", initContext: initContext)
      }
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
      if param.mode == "channel" {
-        super.init(device: device, inFunctionName: "prelu_channel_half")
+        super.init(device: device, inFunctionName: "prelu_channel_half", initContext: initContext)
      } else if param.mode == "element" {
-        super.init(device: device, inFunctionName: "prelu_element_half")
+        super.init(device: device, inFunctionName: "prelu_element_half", initContext: initContext)
      } else {
-        super.init(device: device, inFunctionName: "prelu_other_half")
+        super.init(device: device, inFunctionName: "prelu_other_half", initContext: initContext)
      }
    } else {
      fatalError()

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
@@ -32,29 +32,28 @@ struct PriorBoxMetalParam {
 class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{
  var metalParam: PriorBoxMetalParam!
-  required init(device: MTLDevice, param: PriorBoxParam<P>) {
+  required init(device: MTLDevice, param: PriorBoxParam<P>, initContext: InitContext) {
    let originDim = param.output.tensorDim;
    param.output.tensorDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]])
    param.output.padToFourDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]])
-    param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: GlobalConfig.shared.computePrecision)
-    param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
+    param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: GlobalConfig.shared.computePrecision)
+    if GlobalConfig.shared.computePrecision == .Float32 {
-    if computePrecision == .Float32 {
      if param.min_max_aspect_ratios_order {
-        super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder")
+        super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder", initContext: initContext)
      } else {
-        super.init(device: device, inFunctionName: "prior_box")
+        super.init(device: device, inFunctionName: "prior_box", initContext: initContext)
      }
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
      if param.min_max_aspect_ratios_order {
-        super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder_half")
+        super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder_half", initContext: initContext)
      } else {
-        super.init(device: device, inFunctionName: "prior_box_half")
+        super.init(device: device, inFunctionName: "prior_box_half", initContext: initContext)
      }
    } else {
      fatalError()
@@ -105,12 +104,12 @@ class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{
      }
    }
-    if computePrecision == .Float16 {
+    if GlobalConfig.shared.computePrecision == .Float16 {
      let buffer = device.makeBuffer(length: outputAspectRatior.count * MemoryLayout<Float16>.size)
      float32ToFloat16(input: &outputAspectRatior, output:(buffer?.contents())!, count: outputAspectRatior.count)
      param.newAspectRatios = buffer
-    } else if computePrecision == .Float32 {
+    } else if GlobalConfig.shared.computePrecision == .Float32 {
      let buffer = device.makeBuffer(bytes: outputAspectRatior, length: outputAspectRatior.count * MemoryLayout<Float32>.size, options: [])
      param.newAspectRatios = buffer
    } else {

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift
@@ -25,11 +25,11 @@ class ReluKernel<P: PrecisionType>: Kernel, Computable{
    encoder.endEncoding()
  }
-  required init(device: MTLDevice, param: ReluParam<P>) {
+  required init(device: MTLDevice, param: ReluParam<P>, initContext: InitContext) {
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "relu")
+      super.init(device: device, inFunctionName: "relu", initContext: initContext)
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "relu_half")
+      super.init(device: device, inFunctionName: "relu_half", initContext: initContext)
    } else {
      fatalError()
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift
@@ -31,8 +31,8 @@ class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
  var metalParam: ReshapeMetalParam
-  required init(device: MTLDevice, param: ReshapeParam<P>) {
+  required init(device: MTLDevice, param: ReshapeParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, computePrecision: computePrecision)
+    param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
    var id: [Int32] = [1, 1, 1, 1]
    for i in 0..<param.input.tensorDim.cout() {
      id[4-param.input.tensorDim.cout()+i] = Int32(param.input.tensorDim[i])
@@ -51,23 +51,23 @@ class ReshapeKernel<P: PrecisionType>: Kernel, Computable{
    )
    let irank = param.input.tensorDim.cout()
    let orank = param.output.tensorDim.cout()
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_float")
+      super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_float", initContext: initContext)
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_half")
+      super.init(device: device, inFunctionName: "reshape_\(irank)_\(orank)_half", initContext: initContext)
    } else {
      fatalError()
    }
  }
-  required init(device: MTLDevice, testParam: ReshapeTestParam) {
+  required init(device: MTLDevice, testParam: ReshapeTestParam, initContext: InitContext) {
    metalParam = ReshapeMetalParam.init(
    idim: (0, 0, 0, 0),
    itrans: (0, 0, 0, 0),
    odim: (0, 0, 0, 0),
    otrans: (0, 0, 0, 0)
    )
-    super.init(device: device, inFunctionName: "reshape")
+    super.init(device: device, inFunctionName: "reshape", initContext: initContext)
  }
  func compute(commandBuffer: MTLCommandBuffer, param: ReshapeParam<P>) throws {

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeBilinearKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeBilinearKernel.swift
@@ -20,6 +20,17 @@ struct ResizeBilinearMetalParam {
 }
 class ResizeBilinearKernel<P: PrecisionType>: Kernel, Computable{
+  required init(device: MTLDevice, param: ResizeBilinearParam<P>, initContext: InitContext) {
+    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+    if GlobalConfig.shared.computePrecision == .Float32 {
+      super.init(device: device, inFunctionName: "resize_bilinear", initContext: initContext)
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
+      super.init(device: device, inFunctionName: "resize_bilinear_half", initContext: initContext)
+    } else {
+      fatalError()
+    }
+  }
  func compute(commandBuffer: MTLCommandBuffer, param: ResizeBilinearParam<P>) throws {
    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
      throw PaddleMobileError.predictError(message: " encode is nil")
@@ -35,15 +46,6 @@ class ResizeBilinearKernel<P: PrecisionType>: Kernel, Computable{
    encoder.endEncoding()
  }
-  required init(device: MTLDevice, param: ResizeBilinearParam<P>) {
-    param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
-    if computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "resize_bilinear")
-    } else if computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "resize_bilinear_half")
-    } else {
-      fatalError()
-    }
-  }
 }
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+import Foundation
+class ScaleKernel: CusomKernel {
+  init(device: MTLDevice, shape: Shape, metalLoadMode: MetalLoadMode, metalLibPath: String?) {
+    if GlobalConfig.shared.computePrecision == .Float32 {
+      super.init(device: device, inFunctionName: "scale", outputDim: shape, metalLoadModel: metalLoadMode, metalLibPath: metalLibPath)
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
+      super.init(device: device, inFunctionName: "scale_half", outputDim: shape, metalLoadModel: metalLoadMode, metalLibPath: metalLibPath)
+    } else {
+      fatalError(" unsupport ")
+    }
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ShapeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ShapeKernel.swift
@@ -28,12 +28,12 @@ class ShapeKernel<P: PrecisionType>: Kernel, Computable{
 //    encoder.endEncoding()
  }
-  required init(device: MTLDevice, param: ShapeParam<P>) {
+  required init(device: MTLDevice, param: ShapeParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, computePrecision: computePrecision)
+    param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "shape")
+      super.init(device: device, inFunctionName: "shape", initContext: initContext)
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "shape_half")
+      super.init(device: device, inFunctionName: "shape_half", initContext: initContext)
    } else {
      fatalError()
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift
@@ -22,16 +22,16 @@ struct SoftmaxMetalParam {
 class SoftmaxKernel<P: PrecisionType>: Kernel, Computable{
  var metalParam: SoftmaxMetalParam
-  required init(device: MTLDevice, param: SoftmaxParam<P>) {
+  required init(device: MTLDevice, param: SoftmaxParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, computePrecision: computePrecision)
+    param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
    metalParam = SoftmaxMetalParam.init(
      N: Int32(param.input.tensorDim[0]),
      K: Int32(param.input.tensorDim[1])
    )
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "softmax_float")
+      super.init(device: device, inFunctionName: "softmax_float", initContext: initContext)
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "softmax_half")
+      super.init(device: device, inFunctionName: "softmax_half", initContext: initContext)
    } else {
      fatalError()
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SplitKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SplitKernel.swift
@@ -37,13 +37,13 @@ class SplitKernel<P: PrecisionType>: Kernel, Computable{
    encoder.endEncoding()
  }
-  required init(device: MTLDevice, param: SplitParam<P>) {
+  required init(device: MTLDevice, param: SplitParam<P>, initContext: InitContext) {
    //     param.output.initTexture(device: device, computePrecision: computePrecision)
    let num = param.outputList.count
    let rank = param.input.tensorDim.cout()
    assert(num >= 2 && num <= 4)
    for output in param.outputList {
-      output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
+      output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
    }
    smp = SplitMetalParam.init()
    smp.idim = (Int32(param.input.dim[0]), Int32(param.input.dim[1]), Int32(param.input.dim[2]), Int32(param.input.dim[3]))
@@ -81,10 +81,10 @@ class SplitKernel<P: PrecisionType>: Kernel, Computable{
    if v == "normal" {
      fatalError("split unsupported")
    }
-    if computePrecision == .Float32 {
+    if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_float")
+      super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_float", initContext: initContext)
-    } else if computePrecision == .Float16 {
+    } else if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_half")
+      super.init(device: device, inFunctionName: "split_\(rank)_\(num)_\(v)_half", initContext: initContext)
    } else {
      fatalError()
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Texture2DTo2DArrayKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Texture2DTo2DArrayKernel.swift
@@ -33,12 +33,12 @@ class Texture2DTo2DArrayKernel<P: PrecisionType>: Kernel, Computable{
    encoder.endEncoding()
  }
-  required init(device: MTLDevice, param: FeedParam<P>) {
+  required init(device: MTLDevice, param: FeedParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
+    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision)
-    if computePrecision == .Float16 {
+    if GlobalConfig.shared.computePrecision == .Float16 {
-      super.init(device: device, inFunctionName: "texture2d_to_2d_array_half")
+      super.init(device: device, inFunctionName: "texture2d_to_2d_array_half", initContext: initContext)
-    } else if computePrecision == .Float32 {
+    } else if GlobalConfig.shared.computePrecision == .Float32 {
-      super.init(device: device, inFunctionName: "texture2d_to_2d_array")
+      super.init(device: device, inFunctionName: "texture2d_to_2d_array", initContext: initContext)
    } else {
      fatalError()
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift
@@ -22,8 +22,8 @@ struct TransposeMetalParam {
 class TransposeKernel<P: PrecisionType>: Kernel, Computable {
  var metalParam: TransposeMetalParam = TransposeMetalParam.init()
-  required init(device: MTLDevice, param: TransposeParam<P>) {
+  required init(device: MTLDevice, param: TransposeParam<P>, initContext: InitContext) {
-    param.output.initTexture(device: device, computePrecision: computePrecision)
+    param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision)
    let rank = param.input.tensorDim.cout()
    var axis: [Int] = [0, 1, 2, 3]
    for i in 0..<param.axis.count {
@@ -43,13 +43,13 @@ class TransposeKernel<P: PrecisionType>: Kernel, Computable {
    metalParam.oC = Int32(param.output.dim[3])
    metalParam.axis = (Int32(naxis[0]), Int32(naxis[1]), Int32(naxis[2]), Int32(naxis[3]))
    var kernelFunc = "transpose_undefined"
-    if computePrecision == .Float16 {
+    if GlobalConfig.shared.computePrecision == .Float16 {
      if param.input.transpose == axis {
        kernelFunc = "transpose_copy_half"
      } else {
        kernelFunc = "transpose_\(rank)_half"
      }
-    } else if computePrecision == .Float32 {
+    } else if GlobalConfig.shared.computePrecision == .Float32 {
      if param.input.transpose == axis {
        kernelFunc = "transpose_copy_float"
      } else {
@@ -60,7 +60,7 @@ class TransposeKernel<P: PrecisionType>: Kernel, Computable {
    }
    print("===========>", kernelFunc)
    print(metalParam)
-    super.init(device: device, inFunctionName: kernelFunc)
+    super.init(device: device, inFunctionName: kernelFunc, initContext: initContext)
  }
  func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam<P>) throws {

--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BatchNormKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BatchNormKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+using namespace metal;
+// Float batch-norm kernel: for every texel of the output texture array writes
+//   in * nscale[slice] + nbias[slice], where slice is the array index gid.z.
+kernel void batchnorm(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                      texture2d_array<float, access::write> outTexture [[texture(1)]],
+                      const device float4 * nscale [[buffer(0)]],
+                      const device float4 * nbias [[buffer(1)]],
+                      uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output texture have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) return;
+  const uint slice = gid.z;
+  const float4 texel = inTexture.read(gid.xy, slice);
+  outTexture.write(texel * nscale[slice] + nbias[slice], gid.xy, slice);
+}
+// Half-precision batch-norm kernel: for every texel of the output texture array
+// writes in * newScale[slice] + newBias[slice], where slice is gid.z.
+kernel void batchnorm_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                      texture2d_array<half, access::write> outTexture [[texture(1)]],
+                      const device half4 * newScale [[buffer(0)]],
+                      const device half4 * newBias [[buffer(1)]],
+                      uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output texture have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) return;
+  const uint slice = gid.z;
+  const half4 texel = inTexture.read(gid.xy, slice);
+  outTexture.write(texel * newScale[slice] + newBias[slice], gid.xy, slice);
+}
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BatchNormRelu.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BatchNormRelu.metal
+//
+//  BatchNormRelu.metal
+//  paddle-mobile
+//
+#include <metal_stdlib>
+using namespace metal;
+// Convolution parameter block: signed x/y/z offsets and unsigned x/y strides.
+// NOTE(review): the kernel in this file does not reference this struct, and
+// Common.metal declares a MetalConvParam that additionally carries dilation
+// fields - confirm whether this copy is still needed.
+struct MetalConvParam {
+    short offsetX;
+    short offsetY;
+    short offsetZ;
+    ushort strideX;
+    ushort strideY;
+};
+// Fused batch-norm + ReLU: for every texel of the output texture array writes
+//   max(in * new_scale[slice] + new_biase[slice], 0), where slice is gid.z.
+// The input is fetched through a pixel-coordinate, nearest-filter sampler that
+// clamps out-of-range addresses to zero.
+kernel void batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                         const device float4 *new_scale [[buffer(0)]],
+                                         const device float4 *new_biase [[buffer(1)]],
+                                         uint3 gid [[thread_position_in_grid]]) {
+    // Threads that fall outside the output texture have nothing to write.
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    // texture2d_array::sample expects (sampler, float2 coord, uint array).
+    // Pass x/y together as the coordinate and gid.z as the array slice; three
+    // separate scalars do not bind to (x, y, slice).
+    const float4 input = inTexture.sample(sample, float2(gid.xy), gid.z);
+    const float4 output = fmax(input * new_scale[gid.z] + new_biase[gid.z], 0.0);
+    outTexture.write(output, gid.xy, gid.z);
+}
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BilinearInterp.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BilinearInterp.inc.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#ifdef P
+#define CONCAT2(a, b) a ## b
+#define CONCAT2_(a, b) a ## _ ## b
+#define FUNC(f, p) CONCAT2_(f, p)
+#define VECTOR(p, n) CONCAT2(p, n)
+// Bilinear resize of a texture array, instantiated once per element type P.
+// When input and output have the same width and height the texel is copied;
+// otherwise the source position is (gid.x * ratio_w, gid.y * ratio_h) and the
+// four surrounding texels are blended. Neighbours past the right/bottom edge
+// are replaced by the edge texel.
+kernel void FUNC(bilinear_interp, P)(texture2d_array<P, access::read> input [[texture(0)]],
+                     texture2d_array<P, access::write> output [[texture(1)]],
+                     constant bilinear_interp_param & pm [[buffer(0)]],
+                     uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output texture must not write.
+  if (gid.x >= output.get_width() ||
+      gid.y >= output.get_height() ||
+      gid.z >= output.get_array_size()) return;
+  VECTOR(P, 4) r;
+  if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
+    r = input.read(gid.xy, gid.z);
+  } else {
+    // Source coordinates are always computed in float: with P == half, large
+    // coordinates cannot be represented exactly, which would shift the
+    // integer texel index and coarsen the fractional blend weight.
+    float w = gid.x * pm.ratio_w;
+    float h = gid.y * pm.ratio_h;
+    uint w0 = w, h0 = h;
+    uint w1 = w0 + 1, h1 = h0 + 1;
+    // Fractional parts become the blend weights, converted to P for the mix.
+    P w1lambda = P(w - w0), h1lambda = P(h - h0);
+    P w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
+    if (w1 >= input.get_width()) w1 = w0;
+    if (h1 >= input.get_height()) h1 = h0;
+    VECTOR(P, 4) r0 = input.read(uint2(w0, h0), gid.z);
+    VECTOR(P, 4) r1 = input.read(uint2(w1, h0), gid.z);
+    VECTOR(P, 4) r2 = input.read(uint2(w0, h1), gid.z);
+    VECTOR(P, 4) r3 = input.read(uint2(w1, h1), gid.z);
+    r = h2lambda * (w2lambda * r0 + w1lambda * r1)
+      + h1lambda * (w2lambda * r2 + w1lambda * r3);
+  }
+  output.write(r, gid.xy, gid.z);
+}
+#endif
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BilinearInterp.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BilinearInterp.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+using namespace metal;
+// Parameters read by the bilinear_interp kernels from buffer(0).
+struct bilinear_interp_param {
+  float ratio_h;  // multiplied by gid.y to obtain the source row coordinate
+  float ratio_w;  // multiplied by gid.x to obtain the source column coordinate
+};
+// BilinearInterp.inc.metal is a template keyed on the macro P; including it
+// once per element type emits bilinear_interp_float and bilinear_interp_half.
+#define P float
+#include "BilinearInterp.inc.metal"
+#undef P
+#define P half
+#include "BilinearInterp.inc.metal"
+#undef P
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BoxCoder.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BoxCoder.inc.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#ifdef P
+#define CONCAT2(a, b) a ## b
+#define CONCAT2_(a, b) a ## _ ## b
+#define FUNC(f, p) CONCAT2_(f, p)
+#define VECTOR(p, n) CONCAT2(p, n)
+// Box decoding kernel, instantiated once per element type P.
+// Reads one prior box and its variance from column 0 of row gid.x, gathers the
+// four target components from columns 0..3 of the same row, and writes the
+// decoded box as two corner points (centre -/+ half size).
+kernel void FUNC(boxcoder, P)(texture2d_array<P, access::read> priorBox [[texture(0)]],
+                     texture2d_array<P, access::read> priorBoxVar [[texture(1)]],
+                     texture2d_array<P, access::read> targetBox [[texture(2)]],
+                     texture2d_array<P, access::write> output[[texture(3)]],
+                     uint3 gid [[thread_position_in_grid]]) {
+  const uint row = gid.x;
+  const uint slice = gid.z;
+  const VECTOR(P, 4) prior = priorBox.read(uint2(0, row), slice);
+  const VECTOR(P, 4) variance = priorBoxVar.read(uint2(0, row), slice);
+  // Each target component lives in the first lane of its own texel.
+  VECTOR(P, 4) target;
+  for (uint c = 0; c < 4; c++) {
+    target[c] = targetBox.read(uint2(c, row), slice)[0];
+  }
+  // Centre and extent of the prior box.
+  const P priorCx = (prior.x + prior.z) / 2;
+  const P priorCy = (prior.y + prior.w) / 2;
+  const P priorW = prior.z - prior.x;
+  const P priorH = prior.w - prior.y;
+  // Decoded centre (linear in the target) and extent (exponential in the target).
+  const P cx = variance.x * target.x * priorW + priorCx;
+  const P cy = variance.y * target.y * priorH + priorCy;
+  const P bw = exp(variance.z * target.z) * priorW;
+  const P bh = exp(variance.w * target.w) * priorH;
+  VECTOR(P, 4) decoded;
+  decoded.x = cx - bw / 2;
+  decoded.y = cy - bh / 2;
+  decoded.z = cx + bw / 2;
+  decoded.w = cy + bh / 2;
+  output.write(decoded, gid.xy, gid.z);
+}
+#endif
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BoxCoder.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BoxCoder.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+using namespace metal;
+// BoxCoder.inc.metal is a template keyed on the macro P; including it once per
+// element type emits the kernels boxcoder_float and boxcoder_half.
+#define P float
+#include "BoxCoder.inc.metal"
+#undef P
+#define P half
+#include "BoxCoder.inc.metal"
+#undef P
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Common.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Common.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+using namespace metal;
+// Rank-1 mapping from a texture position xyzn (x, y, z, lane n) to a 4-entry
+// tensor index abcd: abcd[0..2] are cleared and abcd[3] = xyzn[0] * 4 + xyzn[3].
+inline void xyzn2abcd_1(int xyzn[4], int abcd[4]) {
+  abcd[0] = abcd[1] = abcd[2] = 0;
+  abcd[3] = xyzn[0] * 4 + xyzn[3];
+}
+// Rank-2 mapping from texture position to tensor index: abcd[0..1] are
+// cleared, abcd[2] = xyzn[1] and abcd[3] = xyzn[0] * 4 + xyzn[3].
+inline void xyzn2abcd_2(int xyzn[4], int abcd[4]) {
+  abcd[0] = abcd[1] = 0;
+  abcd[2] = xyzn[1];
+  abcd[3] = xyzn[0] * 4 + xyzn[3];
+}
+// Rank-3 mapping from texture position to tensor index: abcd[0] is cleared,
+// abcd[3] = xyzn[0], abcd[2] = xyzn[1] and abcd[1] = xyzn[2] * 4 + xyzn[3].
+inline void xyzn2abcd_3(int xyzn[4], int abcd[4]) {
+  abcd[0] = 0;
+  abcd[3] = xyzn[0];
+  abcd[2] = xyzn[1];
+  abcd[1] = xyzn[2] * 4 + xyzn[3];
+}
+// Rank-4 mapping from texture position to tensor index. The flattened value
+// t = xyzn[2] * 4 + xyzn[3] is split by C into abcd[0] = t / C and
+// abcd[3] = t % C; abcd[2] and abcd[1] take xyzn[0] and xyzn[1].
+// C must be non-zero (it is used as a divisor).
+inline void xyzn2abcd_4(int C, int xyzn[4], int abcd[4]) {
+  abcd[2] = xyzn[0];
+  abcd[1] = xyzn[1];
+  uint t = xyzn[2] * 4 + xyzn[3];
+  abcd[0] = t / C;
+  abcd[3] = t % C;
+}
+// Rank-1 mapping from a tensor index abcd back to a texture position xyzn;
+// the inverse of xyzn2abcd_1 (abcd[3] = xyzn[0] * 4 + xyzn[3]).
+// xyzn[1] and xyzn[2] are cleared, xyzn[0] = abcd[3] / 4, and the remainder
+// abcd[3] % 4 selects the lane xyzn[3].
+inline void abcd2xyzn_1(int abcd[4], int xyzn[4]) {
+  xyzn[1] = xyzn[2] = 0;
+  xyzn[0] = abcd[3] / 4;
+  // The remainder is the lane index, so it belongs in xyzn[3]; storing it in
+  // xyzn[1] would leave xyzn[3] unset for callers that index a texel with it.
+  xyzn[3] = abcd[3] % 4;
+}
+// Rank-2 mapping from tensor index to texture position: xyzn[2] is cleared,
+// xyzn[1] = abcd[2], xyzn[0] = abcd[3] / 4 and the lane xyzn[3] = abcd[3] % 4.
+inline void abcd2xyzn_2(int abcd[4], int xyzn[4]) {
+  xyzn[2] = 0;
+  xyzn[1] = abcd[2];
+  xyzn[0] = abcd[3] / 4;
+  xyzn[3] = abcd[3] % 4;
+}
+// Rank-3 mapping from tensor index to texture position: xyzn[0] = abcd[3],
+// xyzn[1] = abcd[2], xyzn[2] = abcd[1] / 4 and the lane xyzn[3] = abcd[1] % 4.
+inline void abcd2xyzn_3(int abcd[4], int xyzn[4]) {
+  xyzn[0] = abcd[3];
+  xyzn[1] = abcd[2];
+  xyzn[2] = abcd[1] / 4;
+  xyzn[3] = abcd[1] % 4;
+}
+// Rank-4 mapping from tensor index to texture position. abcd[0] and abcd[3]
+// are flattened as t = abcd[0] * C + abcd[3], then split into
+// xyzn[2] = t / 4 and the lane xyzn[3] = t % 4; xyzn[0] and xyzn[1] take
+// abcd[2] and abcd[1].
+inline void abcd2xyzn_4(int C, int abcd[4], int xyzn[4]) {
+  xyzn[0] = abcd[2];
+  xyzn[1] = abcd[1];
+  uint t = abcd[0] * C + abcd[3];
+  xyzn[2] = t / 4;
+  xyzn[3] = t % 4;
+}
+// Texture position to tensor index without a rank suffix; the body performs
+// the same computation as xyzn2abcd_4: t = xyzn[2] * 4 + xyzn[3] is split by C
+// into abcd[0] = t / C and abcd[3] = t % C. C must be non-zero.
+inline void xyzn2abcd(int C, int xyzn[4], int abcd[4]) {
+  abcd[2] = xyzn[0];
+  abcd[1] = xyzn[1];
+  uint t = xyzn[2] * 4 + xyzn[3];
+  abcd[0] = t / C;
+  abcd[3] = t % C;
+}
+// Tensor index to texture position without a rank suffix; the body performs
+// the same computation as abcd2xyzn_4: t = abcd[0] * C + abcd[3] is split into
+// xyzn[2] = t / 4 and the lane xyzn[3] = t % 4.
+inline void abcd2xyzn(int C, int abcd[4], int xyzn[4]) {
+  xyzn[0] = abcd[2];
+  xyzn[1] = abcd[1];
+  uint t = abcd[0] * C + abcd[3];
+  xyzn[2] = t / 4;
+  xyzn[3] = t % 4;
+}
+// Flattens a 4-entry index into a linear offset with abcd[3] varying fastest:
+//   ((abcd[0] * dim[1] + abcd[1]) * dim[2] + abcd[2]) * dim[3] + abcd[3].
+// dim[0] is not used by the computation.
+inline int32_t abcd2index(int32_t dim[4], int32_t abcd[4]) {
+  int32_t r = abcd[0];
+  r = r * dim[1] + abcd[1];
+  r = r * dim[2] + abcd[2];
+  r = r * dim[3] + abcd[3];
+  return r;
+}
+// Splits a linear offset back into a 4-entry index, peeling off dim[3], dim[2]
+// and dim[1] in turn; whatever remains becomes abcd[0]. dim[1..3] must be
+// non-zero (they are used as divisors); dim[0] is not used.
+inline void index2abcd(int32_t dim[4], int32_t ind, int32_t abcd[4]) {
+  abcd[3] = ind % dim[3]; ind /= dim[3];
+  abcd[2] = ind % dim[2]; ind /= dim[2];
+  abcd[1] = ind % dim[1]; ind /= dim[1];
+  abcd[0] = ind;
+}
+// Applies a permutation by gathering: opos[i] = ipos[trans[i]] for i in 0..3.
+// Note that the parameter `trans` shadows the function name inside the body.
+inline void trans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) {
+  for (int i = 0; i < 4; i++) {
+    opos[i] = ipos[trans[i]];
+  }
+}
+// Applies a permutation by scattering: opos[trans[i]] = ipos[i] for i in 0..3,
+// which undoes the gather performed by trans() when `trans` is a permutation.
+inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) {
+  for (int i = 0; i < 4; i++) {
+    opos[trans[i]] = ipos[i];
+  }
+}
+// Convolution parameter block: signed x/y/z offsets followed by unsigned
+// x/y strides and x/y dilations.
+// NOTE(review): presumably mirrors a host-side struct with the same field
+// order - confirm against the Swift caller before reordering fields.
+struct MetalConvParam {
+  short offsetX;
+  short offsetY;
+  short offsetZ;
+  ushort strideX;
+  ushort strideY;
+  ushort dilationX;
+  ushort dilationY;
+};
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConcatKernel.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConcatKernel.inc.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#ifdef P
+// Token-pasting helpers: CONCAT2 joins two tokens directly, the CONCATn_
+// variants join n tokens with underscores.
+#define CONCAT2(a, b) a ## b
+#define CONCAT2_(a, b) a ## _ ## b
+#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
+#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d
+#define CONCAT5_(a, b, c, d, e) a ## _ ## b ## _ ## c ## _ ## d ## _ ## e
+// FUNC builds the kernel name f_r_n_v_p (e.g. concat_3_5_x_float) after the
+// arguments have been macro-expanded; VECTOR(p, n) builds a vector type name
+// such as float4; FUNC_R(f, r) builds a rank-suffixed helper name such as
+// xyzn2abcd_3.
+#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p)
+#define VECTOR(p, n) CONCAT2(p, n)
+#define FUNC_R(f, r) CONCAT2_(f, r)
+// VV is the variant token pasted into the kernel name, selected by V.
+#if V == VX
+#define VV x
+#elif V == VY
+#define VV y
+#elif V == VZ
+#define VV z
+#else
+#define VV normal
+#endif
+#if V == VNORMAL
+//kernel void FUNC(concat, R, N, normal, P)(array<texture2d_array<P, access::read>, N> in [[texture(0)]],
+//                                     texture2d_array<P, access::read> out_x [[texture(N)]],
+//                                     texture2d_array<P, access::write> out [[texture(N+1)]],
+//                                     constant ConcatParam & pm [[buffer(0)]],
+//                                     uint3 gid [[thread_position_in_grid]]) {
+//}
+// Normal (loop) mode concat for rank R and N inputs (N in 2..6).
+// Each thread produces one output texel. The texel starts as the value read
+// from `inx`; then, for each of its four lanes, the lane's position is mapped
+// to a tensor index, the index along cp.axis selects the input that owns the
+// element, and the lane is replaced by the element read from that input.
+// Lanes whose index along the axis is below cp.offset, or past the combined
+// extent of all inputs, keep the value from `inx`.
+kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
+                                          texture2d_array<P, access::read> in1 [[texture(1)]],
+#if N >= 3
+                                          texture2d_array<P, access::read> in2 [[texture(2)]],
+#endif
+#if N >= 4
+                                          texture2d_array<P, access::read> in3 [[texture(3)]],
+#endif
+#if N >= 5
+                                          texture2d_array<P, access::read> in4 [[texture(4)]],
+#endif
+#if N >= 6
+                                          texture2d_array<P, access::read> in5 [[texture(5)]],
+#endif
+                                          texture2d_array<P, access::read> inx [[texture(N)]],
+                                          texture2d_array<P, access::write> out [[texture(N+1)]],
+                                          constant ConcatParam & pm [[buffer(0)]],
+                                          uint3 gid [[thread_position_in_grid]]) {
+   // Work on a private copy because odim is patched temporarily below.
+   ConcatParam cp = pm;
+   int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4];
+   VECTOR(P, 4) r = inx.read(gid.xy, gid.z);
+   for (int i = 0; i < 4; i++) {
+     // Output position of lane i -> tensor index.
+     xyzn[3] = i;
+#if R == 4
+     xyzn2abcd_4(cp.odim[3], xyzn, abcd);
+#else
+     FUNC_R(xyzn2abcd, R)(xyzn, abcd);
+#endif
+     // k is the index along the concat axis relative to the first input.
+     int k = abcd[cp.axis] - cp.offset;
+     if (k < 0) continue;
+     // Walk the inputs until k falls inside input j.
+     int j = 0;
+     for (; j < N; j++) {
+       if (k < cp.vdim[j]) {
+         break;
+       }
+       k -= cp.vdim[j];
+     }
+     if (j == N) {
+       continue;
+     }
+     // Temporarily replace the axis extent with input j's extent so the
+     // reverse mapping produces a position inside that input's texture;
+     // the original extent is restored right after.
+     int ta = cp.odim[cp.axis];
+     abcd[cp.axis] = k;
+     cp.odim[cp.axis] = cp.vdim[j];
+#if R == 4
+     abcd2xyzn_4(cp.odim[3], abcd, oxyzn);
+#else
+     FUNC_R(abcd2xyzn, R)(abcd, oxyzn);
+#endif
+     cp.odim[cp.axis] = ta;
+     // Textures cannot be indexed dynamically here, so dispatch on j.
+     switch (j) {
+       case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+       case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+#if N >= 3
+       case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+#endif
+#if N >= 4
+       case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+#endif
+#if N >= 5
+       case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+#endif
+#if N >= 6
+       case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+#endif
+     }
+   }
+   out.write(r, gid.xy, gid.z);
+}
+#endif // V == VNORMAL
+#if V == VX
+// Fast-path concat along the texture x axis for N inputs (N in 2..6).
+// x is the output column relative to pm.offset; the inputs are laid out one
+// after another along x with widths pm.vdim[0..N-1]. Each thread copies the
+// whole texel from the input that owns column x. Threads whose column lies
+// before the offset or past the last input write nothing.
+kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
+                                          texture2d_array<P, access::read> in1 [[texture(1)]],
+#if N >= 3
+                                          texture2d_array<P, access::read> in2 [[texture(2)]],
+#endif // N >= 3
+#if N >= 4
+                                          texture2d_array<P, access::read> in3 [[texture(3)]],
+#endif // N >= 4
+#if N >= 5
+                                          texture2d_array<P, access::read> in4 [[texture(4)]],
+#endif // N >= 5
+#if N >= 6
+                                          texture2d_array<P, access::read> in5 [[texture(5)]],
+#endif // N >= 6
+                                          texture2d_array<P, access::write> out [[texture(N)]],
+                                          constant ConcatParam & pm [[buffer(0)]],
+                                          uint3 gid [[thread_position_in_grid]]) {
+  int x = gid.x - pm.offset;
+  if (x < 0) return;
+  if (x < pm.vdim[0]) {
+    // Read at the offset-adjusted column, like every other input below;
+    // gid.x is only the same position when pm.offset is 0.
+    VECTOR(P, 4) r = in0.read(uint2(x, gid.y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+  x -= pm.vdim[0];
+  if (x < pm.vdim[1]) {
+    VECTOR(P, 4) r = in1.read(uint2(x, gid.y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#if N >= 3
+  x -= pm.vdim[1];
+  if (x < pm.vdim[2]) {
+    VECTOR(P, 4) r = in2.read(uint2(x, gid.y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 3
+#if N >= 4
+  x -= pm.vdim[2];
+  if (x < pm.vdim[3]) {
+    VECTOR(P, 4) r = in3.read(uint2(x, gid.y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 4
+#if N >= 5
+  x -= pm.vdim[3];
+  if (x < pm.vdim[4]) {
+    VECTOR(P, 4) r = in4.read(uint2(x, gid.y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 5
+#if N >= 6
+  x -= pm.vdim[4];
+  if (x < pm.vdim[5]) {
+    VECTOR(P, 4) r = in5.read(uint2(x, gid.y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 6
+}
+#endif // V == VX
+#if V == VY
+// Fast-path concat along the texture y axis for N inputs (N in 2..6).
+// y is the output row relative to pm.offset; the inputs are laid out one after
+// another along y with heights pm.vdim[0..N-1]. Each thread copies the whole
+// texel from the input that owns row y. Threads whose row lies before the
+// offset or past the last input write nothing.
+kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
+                                      texture2d_array<P, access::read> in1 [[texture(1)]],
+#if N >= 3
+                                      texture2d_array<P, access::read> in2 [[texture(2)]],
+#endif // N >= 3
+#if N >= 4
+                                      texture2d_array<P, access::read> in3 [[texture(3)]],
+#endif // N >= 4
+#if N >= 5
+                                      texture2d_array<P, access::read> in4 [[texture(4)]],
+#endif // N >= 5
+#if N >= 6
+                                      texture2d_array<P, access::read> in5 [[texture(5)]],
+#endif // N >= 6
+                                      texture2d_array<P, access::write> out [[texture(N)]],
+                                      constant ConcatParam & pm [[buffer(0)]],
+                                      uint3 gid [[thread_position_in_grid]]) {
+  int y = gid.y - pm.offset;
+  if (y < 0) return;
+  if (y < pm.vdim[0]) {
+    // Read at the offset-adjusted row, like every other input below;
+    // gid.y is only the same position when pm.offset is 0.
+    VECTOR(P, 4)  r = in0.read(uint2(gid.x, y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+  y -= pm.vdim[0];
+  if (y < pm.vdim[1]) {
+    VECTOR(P, 4)  r = in1.read(uint2(gid.x, y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#if N >= 3
+  y -= pm.vdim[1];
+  if (y < pm.vdim[2]) {
+    VECTOR(P, 4)  r = in2.read(uint2(gid.x, y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 3
+#if N >= 4
+  y -= pm.vdim[2];
+  if (y < pm.vdim[3]) {
+    VECTOR(P, 4)  r = in3.read(uint2(gid.x, y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 4
+#if N >= 5
+  y -= pm.vdim[3];
+  if (y < pm.vdim[4]) {
+    VECTOR(P, 4)  r = in4.read(uint2(gid.x, y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 5
+#if N >= 6
+  y -= pm.vdim[4];
+  if (y < pm.vdim[5]) {
+    VECTOR(P, 4)  r = in5.read(uint2(gid.x, y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 6
+}
+#endif // V == VY
+#if V == VZ
+// Fast-path concat along the texture array (z) axis for N inputs (N in 2..6).
+// z is the output slice relative to pm.offset; the inputs are laid out one
+// after another along z with slice counts pm.vdim[0..N-1]. Each thread copies
+// the whole texel from the input that owns slice z. Threads whose slice lies
+// before the offset or past the last input write nothing.
+kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
+                                      texture2d_array<P, access::read> in1 [[texture(1)]],
+#if N >= 3
+                                      texture2d_array<P, access::read> in2 [[texture(2)]],
+#endif // N >= 3
+#if N >= 4
+                                      texture2d_array<P, access::read> in3 [[texture(3)]],
+#endif // N >= 4
+#if N >= 5
+                                      texture2d_array<P, access::read> in4 [[texture(4)]],
+#endif // N >= 5
+#if N >= 6
+                                      texture2d_array<P, access::read> in5 [[texture(5)]],
+#endif // N >= 6
+                                      texture2d_array<P, access::write> out [[texture(N)]],
+                                      constant ConcatParam & pm [[buffer(0)]],
+                                      uint3 gid [[thread_position_in_grid]]) {
+  int z = gid.z - pm.offset;
+  if (z < 0) return;
+  if (z < pm.vdim[0]) {
+    // Read at the offset-adjusted slice, like every other input below;
+    // gid.z is only the same slice when pm.offset is 0.
+    VECTOR(P, 4) r = in0.read(gid.xy, z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+  z -= pm.vdim[0];
+  if (z < pm.vdim[1]) {
+    VECTOR(P, 4)  r = in1.read(gid.xy, z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#if N >= 3
+  z -= pm.vdim[1];
+  if (z < pm.vdim[2]) {
+    VECTOR(P, 4)  r = in2.read(gid.xy, z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 3
+#if N >= 4
+  z -= pm.vdim[2];
+  if (z < pm.vdim[3]) {
+    VECTOR(P, 4)  r = in3.read(gid.xy, z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 4
+#if N >= 5
+  z -= pm.vdim[3];
+  if (z < pm.vdim[4]) {
+    VECTOR(P, 4)  r = in4.read(gid.xy, z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 5
+#if N >= 6
+  z -= pm.vdim[4];
+  if (z < pm.vdim[5]) {
+    VECTOR(P, 4)  r = in5.read(gid.xy, z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 6
+}
+#endif // V == VZ
+#undef VV
+#endif // #ifdef P
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConcatKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConcatKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+// Parameters read by the concat kernels from buffer(0).
+struct ConcatParam {
+  int32_t odim[4];   // extents used when mapping tensor indices; odim[3] is passed as C to the rank-4 helpers
+  int32_t axis;      // index into abcd/odim along which inputs are concatenated (normal mode)
+  int32_t offset;    // subtracted from the position along the concat axis before selecting an input
+  int32_t trans[4];  // NOTE(review): not read by the kernels in ConcatKernel.inc.metal - confirm usage
+  int32_t vdim[6];   // extent of each of the up to six inputs along the concat axis
+};
+// Values for the macro V, which selects the kernel variant that
+// ConcatKernel.inc.metal emits: the generic per-lane loop (VNORMAL) or a
+// whole-texel copy along x, y or z.
+#define VNORMAL 1
+#define VX 2
+#define VY 3
+#define VZ 4
+// >> fast mode
+// only support concat_{2,3,4}_{2,3,4,5,6}_y_{float,half}
+// only support concat_{3,4}_{2,3,4,5,6}_x_{float,half}
+// only support concat_{1,2,3,4}_{2,3,4,5,6}_z_{float,half}
+// >> normal mode (loop mode)
+// ssd-ar: (R=4, N=3, V=z), (R=3, N=2, V=y), (R=2, N=5, V=x), (R=3, N=5, V=x)
+// ssd: (R=2, N=6, V=y), (R=3, N=6, V=y)
+// genet: (R=4, N=2, V=normal)
+// Kernel instantiations. Every group below defines V, R and N, includes
+// ConcatKernel.inc.metal once with P = float and once with P = half, and then
+// undefines everything again so the next group starts from a clean slate.
+// N is the number of inputs handled by the kernel.
+// NOTE(review): R presumably is the tensor rank -- confirm against the .inc file.
+// ssd-ar: (R=3, N=5, V=x)
+#define V VX
+  #define R 3
+    #define N 5
+      #define P float
+        #include "ConcatKernel.inc.metal"
+      #undef P
+      #define P half
+        #include "ConcatKernel.inc.metal"
+      #undef P
+    #undef N
+  #undef R
+#undef V
+// ssd-ar: (R=2, N=5, V=x)
+#define V VX
+  #define R 2
+    #define N 5
+      #define P float
+        #include "ConcatKernel.inc.metal"
+      #undef P
+      #define P half
+        #include "ConcatKernel.inc.metal"
+      #undef P
+    #undef N
+  #undef R
+#undef V
+// ssd-ar: (R=3, N=2, V=y)
+#define V VY
+  #define R 3
+    #define N 2
+      #define P float
+        #include "ConcatKernel.inc.metal"
+      #undef P
+      #define P half
+        #include "ConcatKernel.inc.metal"
+      #undef P
+    #undef N
+  #undef R
+#undef V
+// ssd-ar: (R=4, N=3, V=z)
+#define V VZ
+  #define R 4
+    #define N 3
+      #define P float
+        #include "ConcatKernel.inc.metal"
+      #undef P
+      #define P half
+        #include "ConcatKernel.inc.metal"
+      #undef P
+    #undef N
+  #undef R
+#undef V
+// ssd: (R=2, N=6, V=y)
+#define V VY
+  #define R 2
+    #define N 6
+      #define P float
+        #include "ConcatKernel.inc.metal"
+      #undef P
+      #define P half
+        #include "ConcatKernel.inc.metal"
+      #undef P
+    #undef N
+  #undef R
+#undef V
+// ssd: (R=3, N=6, V=y)
+#define V VY
+  #define R 3
+    #define N 6
+      #define P float
+        #include "ConcatKernel.inc.metal"
+      #undef P
+      #define P half
+        #include "ConcatKernel.inc.metal"
+      #undef P
+    #undef N
+  #undef R
+#undef V
+// genet: (R=4, N=2, V=normal)
+#define V VNORMAL
+  #define R 4
+    #define N 2
+      #define P float
+        #include "ConcatKernel.inc.metal"
+      #undef P
+      #define P half
+        #include "ConcatKernel.inc.metal"
+      #undef P
+    #undef N
+  #undef R
+#undef V
+// (R=2, N=2, V=y) -- NOTE(review): which model needs this variant is not recorded here
+#define V VY
+  #define R 2
+    #define N 2
+      #define P float
+        #include "ConcatKernel.inc.metal"
+      #undef P
+      #define P half
+        #include "ConcatKernel.inc.metal"
+      #undef P
+    #undef N
+  #undef R
+#undef V
+// (R=2, N=5, V=y) -- NOTE(review): which model needs this variant is not recorded here
+#define V VY
+  #define R 2
+    #define N 5
+      #define P float
+        #include "ConcatKernel.inc.metal"
+      #undef P
+      #define P half
+        #include "ConcatKernel.inc.metal"
+      #undef P
+    #undef N
+  #undef R
+#undef V
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddBNReluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddBNReluKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+// 1x1 convolution + bias + scale/shift + ReLU on half textures.
+// One thread produces one texel (four lanes) of output slice gid.z; the sum is
+// accumulated in float and narrowed to half only when it is written.
+kernel void conv_add_batch_norm_relu_1x1_half(
+            texture2d_array<half, access::sample> inTexture [[texture(0)]],
+            texture2d_array<half, access::write> outTexture [[texture(1)]],
+            constant MetalConvParam &param [[buffer(0)]],
+            const device half4 *weights [[buffer(1)]],
+            const device half4 *biase [[buffer(2)]],
+            const device half4 *new_scale [[buffer(3)]],
+            const device half4 *new_biase [[buffer(4)]],
+            uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output texture array have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  // Input pixel for this output pixel: gid * stride + offset, computed in ushort lanes.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 src = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  const float2 srcCoord = float2(src.x, src.y);
+
+  // Each output slice owns four consecutive weight rows (one per output lane);
+  // a row holds one half4 per input slice.
+  const uint sliceCount = inTexture.get_array_size();
+  const uint rowX = gid.z * sliceCount * 4;
+  const uint rowY = rowX + sliceCount;
+  const uint rowZ = rowY + sliceCount;
+  const uint rowW = rowZ + sliceCount;
+
+  float4 sum = float4(0.0);
+  for (uint s = 0; s < sliceCount; ++s) {
+    const half4 texel = inTexture.sample(sample, srcCoord, s);
+    const half4 wx = weights[rowX + s];
+    sum.x += dot(texel, wx);
+    const half4 wy = weights[rowY + s];
+    sum.y += dot(texel, wy);
+    const half4 wz = weights[rowZ + s];
+    sum.z += dot(texel, wz);
+    const half4 ww = weights[rowW + s];
+    sum.w += dot(texel, ww);
+  }
+
+  // Add the bias, scale by new_scale, shift by new_biase, then clamp at zero (ReLU).
+  sum = fmax((sum + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+  outTexture.write(half4(sum), gid.xy, gid.z);
+}
+// 3x3 convolution + bias + scale/shift + ReLU on half textures.
+// One thread produces one texel (four lanes) of output slice gid.z; the sum is
+// accumulated in float and narrowed to half only when it is written.
+kernel void conv_add_batch_norm_relu_3x3_half(
+            texture2d_array<half, access::sample> inTexture [[texture(0)]],
+            texture2d_array<half, access::write> outTexture [[texture(1)]],
+            constant MetalConvParam &param [[buffer(0)]],
+            const device half4 *weights [[buffer(1)]],
+            const device half4 *biase [[buffer(2)]],
+            const device half4 *new_scale [[buffer(3)]],
+            const device half4 *new_biase [[buffer(4)]],
+            uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output texture array have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  // Centre of the 3x3 window: gid * stride + offset, computed in ushort lanes
+  // and then widened to int so the +/-1 neighbour offsets are plain int math.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 centre = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  const int centreX = centre.x;
+  const int centreY = centre.y;
+
+  // Weight layout: [output lane][tap][input slice]; each output slice owns four lanes.
+  const uint tapCount = 9;
+  const uint sliceCount = inTexture.get_array_size();
+  const uint laneStride = tapCount * sliceCount;
+  const uint laneX = gid.z * tapCount * sliceCount * 4;
+  const uint laneY = laneX + laneStride;
+  const uint laneZ = laneY + laneStride;
+  const uint laneW = laneZ + laneStride;
+
+  float4 sum = float4(0.0);
+  for (uint s = 0; s < sliceCount; ++s) {
+    // Taps are visited row by row, top-left first, matching the weight order.
+    for (int dy = -1; dy <= 1; ++dy) {
+      for (int dx = -1; dx <= 1; ++dx) {
+        const uint tap = uint((dy + 1) * 3 + (dx + 1));
+        const half4 texel = inTexture.sample(sample, float2(centreX + dx, centreY + dy), s);
+        const uint column = tap * sliceCount + s;
+        const half4 wx = weights[laneX + column];
+        sum.x += dot(texel, wx);
+        const half4 wy = weights[laneY + column];
+        sum.y += dot(texel, wy);
+        const half4 wz = weights[laneZ + column];
+        sum.z += dot(texel, wz);
+        const half4 ww = weights[laneW + column];
+        sum.w += dot(texel, ww);
+      }
+    }
+  }
+
+  // Add the bias, scale by new_scale, shift by new_biase, then clamp at zero (ReLU).
+  sum = fmax((sum + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+  outTexture.write(half4(sum), gid.xy, gid.z);
+}
+// Depthwise 3x3 convolution + bias + scale/shift + ReLU on half textures.
+// Each output lane is fed only by the matching lane of input slice gid.z and
+// has its own run of nine scalar weights.
+kernel void depthwise_conv_add_batch_norm_relu_3x3_half(
+            texture2d_array<half, access::sample> inTexture [[texture(0)]],
+            texture2d_array<half, access::write> outTexture [[texture(1)]],
+            constant MetalConvParam &param [[buffer(0)]],
+            const device half *weights [[buffer(1)]],
+            const device half4 *biase [[buffer(2)]],
+            const device half4 *new_scale [[buffer(3)]],
+            const device half4 *new_biase [[buffer(4)]],
+            uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output texture array have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  // Centre of the 3x3 window: gid * stride + offset, computed in ushort lanes
+  // and then widened to int so the +/-1 neighbour offsets are plain int math.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 centre = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  const int centreX = centre.x;
+  const int centreY = centre.y;
+
+  // The input slice read is the same index as the output slice written.
+  const uint slice = gid.z;
+
+  // Four runs of nine scalar weights per slice, one run per lane.
+  const uint tapCount = 9;
+  const uint laneX = gid.z * tapCount * 4;
+  const uint laneY = laneX + tapCount;
+  const uint laneZ = laneY + tapCount;
+  const uint laneW = laneZ + tapCount;
+
+  float4 sum = float4(0.0);
+  // Taps are visited row by row, top-left first, matching the weight order.
+  for (int dy = -1; dy <= 1; ++dy) {
+    for (int dx = -1; dx <= 1; ++dx) {
+      const uint tap = uint((dy + 1) * 3 + (dx + 1));
+      const half4 texel = inTexture.sample(sample, float2(centreX + dx, centreY + dy), slice);
+      sum.x += texel.x * weights[laneX + tap];
+      sum.y += texel.y * weights[laneY + tap];
+      sum.z += texel.z * weights[laneZ + tap];
+      sum.w += texel.w * weights[laneW + tap];
+    }
+  }
+
+  // Add the bias, scale by new_scale, shift by new_biase, then clamp at zero (ReLU).
+  sum = fmax((sum + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+  outTexture.write(half4(sum), gid.xy, gid.z);
+}
+/*---------------------------------------------*/
+// 1x1 convolution + bias + scale/shift + ReLU on float textures.
+// One thread produces one texel (four lanes) of output slice gid.z.
+kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                         constant MetalConvParam &param [[buffer(0)]],
+                                         const device float4 *weights [[buffer(1)]],
+                                         const device float4 *biase [[buffer(2)]],
+                                         const device float4 *new_scale [[buffer(3)]],
+                                         const device float4 *new_biase [[buffer(4)]],
+                                         uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output texture array have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  // Input pixel for this output pixel: gid * stride + offset, computed in ushort lanes.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 src = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  const float2 srcCoord = float2(src.x, src.y);
+
+  // Each output slice owns four consecutive weight rows (one per output lane);
+  // a row holds one float4 per input slice.
+  const uint sliceCount = inTexture.get_array_size();
+  const uint rowX = gid.z * sliceCount * 4;
+  const uint rowY = rowX + sliceCount;
+  const uint rowZ = rowY + sliceCount;
+  const uint rowW = rowZ + sliceCount;
+
+  float4 sum = float4(0.0);
+  for (uint s = 0; s < sliceCount; ++s) {
+    const float4 texel = inTexture.sample(sample, srcCoord, s);
+    const float4 wx = weights[rowX + s];
+    sum.x += dot(texel, wx);
+    const float4 wy = weights[rowY + s];
+    sum.y += dot(texel, wy);
+    const float4 wz = weights[rowZ + s];
+    sum.z += dot(texel, wz);
+    const float4 ww = weights[rowW + s];
+    sum.w += dot(texel, ww);
+  }
+
+  // Add the bias, scale by new_scale, shift by new_biase, then clamp at zero (ReLU).
+  sum = fmax((sum + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  outTexture.write(sum, gid.xy, gid.z);
+}
+// 3x3 convolution + bias + scale/shift + ReLU on float textures.
+// One thread produces one texel (four lanes) of output slice gid.z.
+kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                         constant MetalConvParam &param [[buffer(0)]],
+                                         const device float4 *weights [[buffer(1)]],
+                                         const device float4 *biase [[buffer(2)]],
+                                         const device float4 *new_scale [[buffer(3)]],
+                                         const device float4 *new_biase [[buffer(4)]],
+                                         uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output texture array have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  // Centre of the 3x3 window: gid * stride + offset, computed in ushort lanes
+  // and then widened to int so the +/-1 neighbour offsets are plain int math.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 centre = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  const int centreX = centre.x;
+  const int centreY = centre.y;
+
+  // Weight layout: [output lane][tap][input slice]; each output slice owns four lanes.
+  const uint tapCount = 9;
+  const uint sliceCount = inTexture.get_array_size();
+  const uint laneStride = tapCount * sliceCount;
+  const uint laneX = gid.z * tapCount * sliceCount * 4;
+  const uint laneY = laneX + laneStride;
+  const uint laneZ = laneY + laneStride;
+  const uint laneW = laneZ + laneStride;
+
+  float4 sum = float4(0.0);
+  for (uint s = 0; s < sliceCount; ++s) {
+    // Taps are visited row by row, top-left first, matching the weight order.
+    for (int dy = -1; dy <= 1; ++dy) {
+      for (int dx = -1; dx <= 1; ++dx) {
+        const uint tap = uint((dy + 1) * 3 + (dx + 1));
+        const float4 texel = inTexture.sample(sample, float2(centreX + dx, centreY + dy), s);
+        const uint column = tap * sliceCount + s;
+        const float4 wx = weights[laneX + column];
+        sum.x += dot(texel, wx);
+        const float4 wy = weights[laneY + column];
+        sum.y += dot(texel, wy);
+        const float4 wz = weights[laneZ + column];
+        sum.z += dot(texel, wz);
+        const float4 ww = weights[laneW + column];
+        sum.w += dot(texel, ww);
+      }
+    }
+  }
+
+  // Add the bias, scale by new_scale, shift by new_biase, then clamp at zero (ReLU).
+  sum = fmax((sum + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  outTexture.write(sum, gid.xy, gid.z);
+}
+// Depthwise 3x3 convolution + bias + scale/shift + ReLU on float textures.
+// Each output lane is fed only by the matching lane of input slice gid.z and
+// has its own run of nine scalar weights.
+kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                                   texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                                   constant MetalConvParam &param [[buffer(0)]],
+                                                   const device float *weights [[buffer(1)]],
+                                                   const device float4 *biase [[buffer(2)]],
+                                                   const device float4 *new_scale [[buffer(3)]],
+                                                   const device float4 *new_biase [[buffer(4)]],
+                                                   uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output texture array have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  // Centre of the 3x3 window: gid * stride + offset, computed in ushort lanes
+  // and then widened to int so the +/-1 neighbour offsets are plain int math.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 centre = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  const int centreX = centre.x;
+  const int centreY = centre.y;
+
+  // The input slice read is the same index as the output slice written.
+  const uint slice = gid.z;
+
+  // Four runs of nine scalar weights per slice, one run per lane.
+  const uint tapCount = 9;
+  const uint laneX = gid.z * tapCount * 4;
+  const uint laneY = laneX + tapCount;
+  const uint laneZ = laneY + tapCount;
+  const uint laneW = laneZ + tapCount;
+
+  float4 sum = float4(0.0);
+  // Taps are visited row by row, top-left first, matching the weight order.
+  for (int dy = -1; dy <= 1; ++dy) {
+    for (int dx = -1; dx <= 1; ++dx) {
+      const uint tap = uint((dy + 1) * 3 + (dx + 1));
+      const float4 texel = inTexture.sample(sample, float2(centreX + dx, centreY + dy), slice);
+      sum.x += texel.x * weights[laneX + tap];
+      sum.y += texel.y * weights[laneY + tap];
+      sum.z += texel.z * weights[laneZ + tap];
+      sum.w += texel.w * weights[laneW + tap];
+    }
+  }
+
+  // Add the bias, scale by new_scale, shift by new_biase, then clamp at zero (ReLU).
+  sum = fmax((sum + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  outTexture.write(sum, gid.xy, gid.z);
+}
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddMetal.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddMetal.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+#pragma mark - convAdd
+// 1x1 convolution with bias on float textures.
+// One thread produces one texel (four lanes) of output slice gid.z; the
+// accumulator starts from the bias, so no separate add is needed at the end.
+kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output texture array have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  // Input pixel for this output pixel: gid * stride + offset, computed in ushort lanes.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 src = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  const float2 srcCoord = float2(src.x, src.y);
+
+  // Each output slice owns four consecutive weight rows (one per output lane);
+  // a row holds one float4 per input slice.
+  const uint sliceCount = inTexture.get_array_size();
+  const uint rowX = gid.z * sliceCount * 4;
+  const uint rowY = rowX + sliceCount;
+  const uint rowZ = rowY + sliceCount;
+  const uint rowW = rowZ + sliceCount;
+
+  float4 sum = biase[gid.z];
+  for (uint s = 0; s < sliceCount; ++s) {
+    const float4 texel = inTexture.sample(sample, srcCoord, s);
+    const float4 wx = weights[rowX + s];
+    sum.x += dot(texel, wx);
+    const float4 wy = weights[rowY + s];
+    sum.y += dot(texel, wy);
+    const float4 wz = weights[rowZ + s];
+    sum.z += dot(texel, wz);
+    const float4 ww = weights[rowW + s];
+    sum.w += dot(texel, ww);
+  }
+  outTexture.write(sum, gid.xy, gid.z);
+}
+// Dilated 3x3 convolution with bias on float textures.
+// One thread produces one texel (four lanes) of output slice gid.z; the
+// accumulator starts from the bias, so no separate add is needed at the end.
+kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output texture array have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  // Centre of the window: gid * stride + offset, computed in ushort lanes and
+  // then widened to int so the neighbour offsets are plain int math.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 centre = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  const int centreX = centre.x;
+  const int centreY = centre.y;
+
+  // Distance between neighbouring taps, in input pixels.
+  const ushort dilation_x = param.dilationX;
+  const ushort dilation_y = param.dilationY;
+  const int spanX = dilation_x;
+  const int spanY = dilation_y;
+
+  // Weight layout: [output lane][tap][input slice]; each output slice owns four lanes.
+  const uint tapCount = 9;
+  const uint sliceCount = inTexture.get_array_size();
+  const uint laneStride = tapCount * sliceCount;
+  const uint laneX = gid.z * tapCount * sliceCount * 4;
+  const uint laneY = laneX + laneStride;
+  const uint laneZ = laneY + laneStride;
+  const uint laneW = laneZ + laneStride;
+
+  float4 sum = biase[gid.z];
+  for (uint s = 0; s < sliceCount; ++s) {
+    // Taps are visited row by row, top-left first, matching the weight order.
+    for (int dy = -1; dy <= 1; ++dy) {
+      for (int dx = -1; dx <= 1; ++dx) {
+        const uint tap = uint((dy + 1) * 3 + (dx + 1));
+        const float2 at = float2(centreX + dx * spanX, centreY + dy * spanY);
+        const float4 texel = inTexture.sample(sample, at, s);
+        const uint column = tap * sliceCount + s;
+        const float4 wx = weights[laneX + column];
+        sum.x += dot(texel, wx);
+        const float4 wy = weights[laneY + column];
+        sum.y += dot(texel, wy);
+        const float4 wz = weights[laneZ + column];
+        sum.z += dot(texel, wz);
+        const float4 ww = weights[laneW + column];
+        sum.w += dot(texel, ww);
+      }
+    }
+  }
+  outTexture.write(sum, gid.xy, gid.z);
+}
+// Dilated five-tap convolution with bias on float textures; the taps are
+// spread along y at a fixed x. The accumulator starts from the bias.
+kernel void conv_add_5x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output texture array have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  // Centre tap: gid * stride + offset, computed in ushort lanes and then
+  // widened to int so the neighbour offsets are plain int math.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 centre = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  const int centreX = centre.x;
+  const int centreY = centre.y;
+
+  // Distance between neighbouring taps along y, in input pixels.
+  const ushort dilation_y = param.dilationY;
+  const int spanY = dilation_y;
+
+  // Weight layout: [output lane][tap][input slice]; each output slice owns four lanes.
+  const uint tapCount = 5;
+  const uint sliceCount = inTexture.get_array_size();
+  const uint laneStride = tapCount * sliceCount;
+  const uint laneX = gid.z * tapCount * sliceCount * 4;
+  const uint laneY = laneX + laneStride;
+  const uint laneZ = laneY + laneStride;
+  const uint laneW = laneZ + laneStride;
+
+  float4 sum = biase[gid.z];
+  for (uint s = 0; s < sliceCount; ++s) {
+    // k runs over the taps from -2 to +2 dilation steps around the centre.
+    for (int k = -2; k <= 2; ++k) {
+      const uint tap = uint(k + 2);
+      const float4 texel = inTexture.sample(sample, float2(centreX, centreY + k * spanY), s);
+      const uint column = tap * sliceCount + s;
+      const float4 wx = weights[laneX + column];
+      sum.x += dot(texel, wx);
+      const float4 wy = weights[laneY + column];
+      sum.y += dot(texel, wy);
+      const float4 wz = weights[laneZ + column];
+      sum.z += dot(texel, wz);
+      const float4 ww = weights[laneW + column];
+      sum.w += dot(texel, ww);
+    }
+  }
+  outTexture.write(sum, gid.xy, gid.z);
+}
+// Dilated five-tap convolution with bias on float textures; the taps are
+// spread along x at a fixed y. The accumulator starts from the bias.
+kernel void conv_add_1x5(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output texture array have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  // Centre tap: gid * stride + offset, computed in ushort lanes and then
+  // widened to int so the neighbour offsets are plain int math.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 centre = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  const int centreX = centre.x;
+  const int centreY = centre.y;
+
+  // Distance between neighbouring taps along x, in input pixels.
+  const ushort dilation_x = param.dilationX;
+  const int spanX = dilation_x;
+
+  // Weight layout: [output lane][tap][input slice]; each output slice owns four lanes.
+  const uint tapCount = 5;
+  const uint sliceCount = inTexture.get_array_size();
+  const uint laneStride = tapCount * sliceCount;
+  const uint laneX = gid.z * tapCount * sliceCount * 4;
+  const uint laneY = laneX + laneStride;
+  const uint laneZ = laneY + laneStride;
+  const uint laneW = laneZ + laneStride;
+
+  float4 sum = biase[gid.z];
+  for (uint s = 0; s < sliceCount; ++s) {
+    // k runs over the taps from -2 to +2 dilation steps around the centre.
+    for (int k = -2; k <= 2; ++k) {
+      const uint tap = uint(k + 2);
+      const float4 texel = inTexture.sample(sample, float2(centreX + k * spanX, centreY), s);
+      const uint column = tap * sliceCount + s;
+      const float4 wx = weights[laneX + column];
+      sum.x += dot(texel, wx);
+      const float4 wy = weights[laneY + column];
+      sum.y += dot(texel, wy);
+      const float4 wz = weights[laneZ + column];
+      sum.z += dot(texel, wz);
+      const float4 ww = weights[laneW + column];
+      sum.w += dot(texel, ww);
+    }
+  }
+  outTexture.write(sum, gid.xy, gid.z);
+}
+// Depthwise 3x3 convolution with bias on float textures.
+// Each output lane is fed only by the matching lane of input slice gid.z and
+// has its own run of nine scalar weights. The accumulator starts from the bias.
+kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                   texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                   constant MetalConvParam &param [[buffer(0)]],
+                                   const device float *weights [[buffer(1)]],
+                                   const device float4 *biase [[buffer(2)]],
+                                   uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output texture array have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  // Centre of the 3x3 window: gid * stride + offset, computed in ushort lanes
+  // and then widened to int so the +/-1 neighbour offsets are plain int math.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 centre = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  const int centreX = centre.x;
+  const int centreY = centre.y;
+
+  // The input slice read is the same index as the output slice written.
+  const uint slice = gid.z;
+
+  // Four runs of nine scalar weights per slice, one run per lane.
+  const uint tapCount = 9;
+  const uint laneX = gid.z * tapCount * 4;
+  const uint laneY = laneX + tapCount;
+  const uint laneZ = laneY + tapCount;
+  const uint laneW = laneZ + tapCount;
+
+  float4 sum = biase[gid.z];
+  // Taps are visited row by row, top-left first, matching the weight order.
+  for (int dy = -1; dy <= 1; ++dy) {
+    for (int dx = -1; dx <= 1; ++dx) {
+      const uint tap = uint((dy + 1) * 3 + (dx + 1));
+      const float4 texel = inTexture.sample(sample, float2(centreX + dx, centreY + dy), slice);
+      sum.x += texel.x * weights[laneX + tap];
+      sum.y += texel.y * weights[laneY + tap];
+      sum.z += texel.z * weights[laneZ + tap];
+      sum.w += texel.w * weights[laneW + tap];
+    }
+  }
+  outTexture.write(sum, gid.xy, gid.z);
+}
+#pragma mark - half
+// 1x1 convolution with bias on half textures.
+// One thread produces one texel (four lanes) of output slice gid.z. The
+// accumulator is a half4 that starts from the bias, so the whole sum is kept
+// in half precision.
+kernel void conv_add_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<half, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device half4 *weights [[buffer(1)]],
+                         const device half4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output texture array have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  // Input pixel for this output pixel: gid * stride + offset, computed in ushort lanes.
+  const ushort2 strideXY = ushort2(param.strideX, param.strideY);
+  const ushort2 src = ushort2(gid.xy) * strideXY + ushort2(param.offsetX, param.offsetY);
+  const float2 srcCoord = float2(src.x, src.y);
+
+  // Each output slice owns four consecutive weight rows (one per output lane);
+  // a row holds one half4 per input slice.
+  const uint sliceCount = inTexture.get_array_size();
+  const uint rowX = gid.z * sliceCount * 4;
+  const uint rowY = rowX + sliceCount;
+  const uint rowZ = rowY + sliceCount;
+  const uint rowW = rowZ + sliceCount;
+
+  half4 sum = biase[gid.z];
+  for (uint s = 0; s < sliceCount; ++s) {
+    const half4 texel = inTexture.sample(sample, srcCoord, s);
+    const half4 wx = weights[rowX + s];
+    sum.x += dot(texel, wx);
+    const half4 wy = weights[rowY + s];
+    sum.y += dot(texel, wy);
+    const half4 wz = weights[rowZ + s];
+    sum.z += dot(texel, wz);
+    const half4 ww = weights[rowW + s];
+    sum.w += dot(texel, ww);
+  }
+  outTexture.write(sum, gid.xy, gid.z);
+}
+// 3x3 dilated convolution fused with a bias add, half precision storage.
+// One thread computes one texel of output slice gid.z, starting from
+// biase[gid.z]. The nine taps are visited row by row (top-left first), spaced
+// param.dilationX / param.dilationY pixels apart around the centre position.
+// Each dot product is evaluated on float4 copies of the half4 operands before
+// being added to the half accumulator.
+kernel void conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<half, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device half4 *weights [[buffer(1)]],
+                         const device half4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const ushort2 center = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                       + ushort2(param.offsetX, param.offsetY);
+  const ushort stepX = param.dilationX;
+  const ushort stepY = param.dilationY;
+  const uint tapCount = 9;
+  const uint sliceCount = inTexture.get_array_size();
+  // Distance, in half4 entries, between the weights of two output components.
+  const uint componentStride = tapCount * sliceCount;
+  const uint weightBase = gid.z * tapCount * sliceCount * 4;
+  half4 acc = biase[gid.z];
+  for (uint slice = 0; slice < sliceCount; ++slice) {
+    uint tap = 0;
+    for (int ky = -1; ky <= 1; ++ky) {
+      for (int kx = -1; kx <= 1; ++kx, ++tap) {
+        // Out-of-range positions read as zero (clamp_to_zero sampler).
+        const float2 pos = float2(center.x + kx * stepX, center.y + ky * stepY);
+        const float4 texel = float4(inTexture.sample(sample, pos, slice));
+        const uint idx = weightBase + tap * sliceCount + slice;
+        acc.x += dot(texel, float4(weights[idx]));
+        acc.y += dot(texel, float4(weights[idx + componentStride]));
+        acc.z += dot(texel, float4(weights[idx + 2 * componentStride]));
+        acc.w += dot(texel, float4(weights[idx + 3 * componentStride]));
+      }
+    }
+  }
+  outTexture.write(acc, gid.xy, gid.z);
+}
+// Depthwise 3x3 convolution fused with a bias add, half precision.
+// One thread computes one texel of output slice gid.z from the input slice
+// with the same index: each of the four components is filtered with its own
+// nine scalar weights and added to biase[gid.z].
+// Weights are scalars; component c of slice z uses entries
+// z * 36 + c * 9 .. z * 36 + c * 9 + 8, in row-major tap order.
+// The neighbours are always one pixel away; param.dilationX/Y are not read.
+kernel void depthwise_conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                   texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                   constant MetalConvParam &param [[buffer(0)]],
+                                   const device half *weights [[buffer(1)]],
+                                   const device half4 *biase [[buffer(2)]],
+                                   uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint slice = gid.z;
+  const ushort2 center = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                       + ushort2(param.offsetX, param.offsetY);
+  const uint tapCount = 9;
+  const uint weightBase = gid.z * tapCount * 4;
+  half4 acc = biase[gid.z];
+  uint tap = 0;
+  for (int dy = -1; dy <= 1; ++dy) {
+    for (int dx = -1; dx <= 1; ++dx, ++tap) {
+      const half4 texel = inTexture.sample(sample, float2(center.x + dx, center.y + dy), slice);
+      // Gather this tap's weight for each of the four components.
+      const half4 tapWeights = half4(weights[weightBase + tap],
+                                     weights[weightBase + tapCount + tap],
+                                     weights[weightBase + 2 * tapCount + tap],
+                                     weights[weightBase + 3 * tapCount + tap]);
+      acc += texel * tapWeights;
+    }
+  }
+  outTexture.write(acc, gid.xy, gid.z);
+}
+// Five-tap convolution along y fused with a bias add, half precision.
+// One thread computes one texel of output slice gid.z, starting from
+// biase[gid.z]. The taps sit at y offsets -2, -1, 0, +1, +2 times
+// param.dilationY from the centre position; x is not offset.
+kernel void conv_add_5x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<half, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device half4 *weights [[buffer(1)]],
+                         const device half4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const ushort2 center = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                       + ushort2(param.offsetX, param.offsetY);
+  const ushort stepY = param.dilationY;
+  const uint tapCount = 5;
+  const uint sliceCount = inTexture.get_array_size();
+  // Distance, in half4 entries, between the weights of two output components.
+  const uint componentStride = tapCount * sliceCount;
+  const uint weightBase = gid.z * tapCount * sliceCount * 4;
+  half4 acc = biase[gid.z];
+  for (uint slice = 0; slice < sliceCount; ++slice) {
+    for (int tap = 0; tap < 5; ++tap) {
+      // tap - 2 walks the offsets -2 .. +2 around the centre row.
+      const float2 pos = float2(center.x, center.y + (tap - 2) * stepY);
+      const half4 texel = inTexture.sample(sample, pos, slice);
+      const uint idx = weightBase + tap * sliceCount + slice;
+      acc.x += dot(texel, weights[idx]);
+      acc.y += dot(texel, weights[idx + componentStride]);
+      acc.z += dot(texel, weights[idx + 2 * componentStride]);
+      acc.w += dot(texel, weights[idx + 3 * componentStride]);
+    }
+  }
+  outTexture.write(acc, gid.xy, gid.z);
+}
+// Five-tap convolution along x fused with a bias add, half precision.
+// One thread computes one texel of output slice gid.z, starting from
+// biase[gid.z]. The taps sit at x offsets -2, -1, 0, +1, +2 times
+// param.dilationX from the centre position; y is not offset.
+kernel void conv_add_1x5_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<half, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device half4 *weights [[buffer(1)]],
+                         const device half4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const ushort2 center = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                       + ushort2(param.offsetX, param.offsetY);
+  const ushort stepX = param.dilationX;
+  const uint tapCount = 5;
+  const uint sliceCount = inTexture.get_array_size();
+  // Distance, in half4 entries, between the weights of two output components.
+  const uint componentStride = tapCount * sliceCount;
+  const uint weightBase = gid.z * tapCount * sliceCount * 4;
+  half4 acc = biase[gid.z];
+  for (uint slice = 0; slice < sliceCount; ++slice) {
+    for (int tap = 0; tap < 5; ++tap) {
+      // tap - 2 walks the offsets -2 .. +2 around the centre column.
+      const float2 pos = float2(center.x + (tap - 2) * stepX, center.y);
+      const half4 texel = inTexture.sample(sample, pos, slice);
+      const uint idx = weightBase + tap * sliceCount + slice;
+      acc.x += dot(texel, weights[idx]);
+      acc.y += dot(texel, weights[idx + componentStride]);
+      acc.z += dot(texel, weights[idx + 2 * componentStride]);
+      acc.w += dot(texel, weights[idx + 3 * componentStride]);
+    }
+  }
+  outTexture.write(acc, gid.xy, gid.z);
+}
+// Single-thread variant of the float 3x3 dilated convolution.
+// Only the thread at gid == (0, 0, 0) does any work; every other thread
+// returns without writing. The accumulator starts at zero and biase is never
+// read, so the written texel is the raw weighted sum without the bias.
+kernel void test_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                              texture2d_array<float, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device float4 *weights [[buffer(1)]],
+                              const device float4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  // Restrict the computation to the very first output texel.
+  const bool isFirstThread = gid.x == 0 && gid.y == 0 && gid.z == 0;
+  if (!isFirstThread) {
+    return;
+  }
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const ushort2 center = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                       + ushort2(param.offsetX, param.offsetY);
+  const ushort stepX = param.dilationX;
+  const ushort stepY = param.dilationY;
+  const uint tapCount = 9;
+  const uint sliceCount = inTexture.get_array_size();
+  // Distance, in float4 entries, between the weights of two output components.
+  const uint componentStride = tapCount * sliceCount;
+  const uint weightBase = gid.z * tapCount * sliceCount * 4;
+  float4 acc = float4(0.0);
+  for (uint slice = 0; slice < sliceCount; ++slice) {
+    uint tap = 0;
+    for (int ky = -1; ky <= 1; ++ky) {
+      for (int kx = -1; kx <= 1; ++kx, ++tap) {
+        const float2 pos = float2(center.x + kx * stepX, center.y + ky * stepY);
+        const float4 texel = inTexture.sample(sample, pos, slice);
+        const uint idx = weightBase + tap * sliceCount + slice;
+        acc.x += dot(texel, weights[idx]);
+        acc.y += dot(texel, weights[idx + componentStride]);
+        acc.z += dot(texel, weights[idx + 2 * componentStride]);
+        acc.w += dot(texel, weights[idx + 3 * componentStride]);
+      }
+    }
+  }
+  outTexture.write(acc, gid.xy, gid.z);
+}
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddPrelu.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddPrelu.inc.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#ifdef P
+#include "Macro.metal"
+#pragma mark - convAdd
+// 1x1 convolution fused with a bias add and a PReLU activation, generated for
+// element type P and for whichever PRELU_* variant the including file defines.
+// One thread computes one texel of output slice gid.z: the accumulator starts
+// at biase[gid.z], gathers one dot product per input slice and output
+// component, and every component that is not greater than zero is then
+// multiplied by its slope before the texel is written.
+kernel void FUNC3_(conv_add_1x1, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<P, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device VECTOR(P, 4) *weights [[buffer(1)]],
+                         const device VECTOR(P, 4) *biase [[buffer(2)]],
+#ifdef PRELU_CHANNEL
+                         const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_ELEMENT
+                         const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_OTHER
+                         const device P *alpha [[buffer(3)]],
+#endif
+                         uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const ushort2 inputPos = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                         + ushort2(param.offsetX, param.offsetY);
+  const float2 samplePos = float2(inputPos.x, inputPos.y);
+  const uint tapCount = 1;
+  const uint sliceCount = inTexture.get_array_size();
+  // Distance, in vector entries, between the weights of two output components.
+  const uint componentStride = tapCount * sliceCount;
+  const uint weightBase = gid.z * tapCount * sliceCount * 4;
+  VECTOR(P, 4) acc = biase[gid.z];
+  for (uint slice = 0; slice < sliceCount; ++slice) {
+    const VECTOR(P, 4) texel = inTexture.sample(sample, samplePos, slice);
+    const uint idx = weightBase + slice;
+    acc.x += dot(texel, weights[idx]);
+    acc.y += dot(texel, weights[idx + componentStride]);
+    acc.z += dot(texel, weights[idx + 2 * componentStride]);
+    acc.w += dot(texel, weights[idx + 3 * componentStride]);
+  }
+#ifdef PRELU_CHANNEL
+  // One slope vector per output slice.
+  const VECTOR(P, 4) slope = alpha[gid.z];
+#endif
+#ifdef PRELU_ELEMENT
+  // One slope vector per output texel, indexed (y * width + x) * slices + z.
+  const uint texelBase = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
+  const VECTOR(P, 4) slope = alpha[texelBase + gid.z];
+#endif
+#ifdef PRELU_OTHER
+  // A single scalar slope shared by all four components.
+  const VECTOR(P, 4) slope = VECTOR(P, 4)(alpha[0]);
+#endif
+#if defined(PRELU_CHANNEL) || defined(PRELU_ELEMENT) || defined(PRELU_OTHER)
+  for (int c = 0; c < 4; ++c) {
+    if (!(acc[c] > 0)) {
+      acc[c] = slope[c] * acc[c];
+    }
+  }
+#endif
+  outTexture.write(acc, gid.xy, gid.z);
+}
+// 3x3 dilated convolution fused with a bias add and a PReLU activation,
+// generated for element type P and for whichever PRELU_* variant the including
+// file defines.
+// One thread computes one texel of output slice gid.z, starting from
+// biase[gid.z]. The nine taps are visited row by row (top-left first), spaced
+// param.dilationX / param.dilationY pixels apart. Every component that is not
+// greater than zero is multiplied by its slope before the texel is written.
+kernel void FUNC3_(conv_add_3x3, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
+    texture2d_array<P, access::write> outTexture [[texture(1)]],
+    constant MetalConvParam &param [[buffer(0)]],
+    const device VECTOR(P, 4) *weights [[buffer(1)]],
+    const device VECTOR(P, 4) *biase [[buffer(2)]],
+#ifdef PRELU_CHANNEL
+     const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_ELEMENT
+     const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_OTHER
+     const device P *alpha [[buffer(3)]],
+#endif
+     uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const ushort2 center = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                       + ushort2(param.offsetX, param.offsetY);
+  const ushort stepX = param.dilationX;
+  const ushort stepY = param.dilationY;
+  const uint tapCount = 9;
+  const uint sliceCount = inTexture.get_array_size();
+  // Distance, in vector entries, between the weights of two output components.
+  const uint componentStride = tapCount * sliceCount;
+  const uint weightBase = gid.z * tapCount * sliceCount * 4;
+  VECTOR(P, 4) acc = biase[gid.z];
+  for (uint slice = 0; slice < sliceCount; ++slice) {
+    uint tap = 0;
+    for (int ky = -1; ky <= 1; ++ky) {
+      for (int kx = -1; kx <= 1; ++kx, ++tap) {
+        const float2 pos = float2(center.x + kx * stepX, center.y + ky * stepY);
+        const VECTOR(P, 4) texel = inTexture.sample(sample, pos, slice);
+        const uint idx = weightBase + tap * sliceCount + slice;
+        acc.x += dot(texel, weights[idx]);
+        acc.y += dot(texel, weights[idx + componentStride]);
+        acc.z += dot(texel, weights[idx + 2 * componentStride]);
+        acc.w += dot(texel, weights[idx + 3 * componentStride]);
+      }
+    }
+  }
+#ifdef PRELU_CHANNEL
+  // One slope vector per output slice.
+  const VECTOR(P, 4) slope = alpha[gid.z];
+#endif
+#ifdef PRELU_ELEMENT
+  // One slope vector per output texel, indexed (y * width + x) * slices + z.
+  const uint texelBase = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
+  const VECTOR(P, 4) slope = alpha[texelBase + gid.z];
+#endif
+#ifdef PRELU_OTHER
+  // A single scalar slope shared by all four components.
+  const VECTOR(P, 4) slope = VECTOR(P, 4)(alpha[0]);
+#endif
+#if defined(PRELU_CHANNEL) || defined(PRELU_ELEMENT) || defined(PRELU_OTHER)
+  for (int c = 0; c < 4; ++c) {
+    if (!(acc[c] > 0)) {
+      acc[c] = slope[c] * acc[c];
+    }
+  }
+#endif
+  outTexture.write(acc, gid.xy, gid.z);
+}
+// Five-tap convolution along y fused with a bias add and a PReLU activation,
+// generated for element type P and for whichever PRELU_* variant the including
+// file defines.
+// One thread computes one texel of output slice gid.z, starting from
+// biase[gid.z]. The taps sit at y offsets -2, -1, 0, +1, +2 times
+// param.dilationY from the centre position; x is not offset. Every component
+// that is not greater than zero is multiplied by its slope before the write.
+kernel void FUNC3_(conv_add_5x1, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<P, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device VECTOR(P, 4) *weights [[buffer(1)]],
+                         const device VECTOR(P, 4) *biase [[buffer(2)]],
+#ifdef PRELU_CHANNEL
+                        const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_ELEMENT
+                        const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_OTHER
+                        const device P *alpha [[buffer(3)]],
+#endif
+                         uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  const ushort dilation_y = param.dilationY;
+  const uint kernelHXW = 5;
+  const uint input_arr_size = inTexture.get_array_size();
+  // Distance, in vector entries, between the weights of two output components.
+  const uint componentStride = kernelHXW * input_arr_size;
+  const uint weightTo = gid.z * kernelHXW * input_arr_size * 4;
+  // The original initializer ended in a stray second ';' (an empty statement).
+  VECTOR(P, 4) output = biase[gid.z];
+  for (uint i = 0; i < input_arr_size; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      // j - 2 walks the offsets -2 .. +2 around the centre row; positions
+      // outside the texture read as zero (clamp_to_zero sampler).
+      const float2 pos = float2(posInInput.x, posInInput.y + (j - 2) * dilation_y);
+      const VECTOR(P, 4) input = inTexture.sample(sample, pos, i);
+      const uint idx = weightTo + j * input_arr_size + i;
+      output.x += dot(input, weights[idx]);
+      output.y += dot(input, weights[idx + componentStride]);
+      output.z += dot(input, weights[idx + 2 * componentStride]);
+      output.w += dot(input, weights[idx + 3 * componentStride]);
+    }
+  }
+#ifdef PRELU_CHANNEL
+  // One slope vector per output slice.
+  VECTOR(P, 4) alpha_value = alpha[gid.z];
+  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+#endif
+#ifdef PRELU_ELEMENT
+  // One slope vector per output texel, indexed (y * width + x) * slices + z.
+  int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
+  VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
+  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+#endif
+#ifdef PRELU_OTHER
+  // A single scalar slope shared by all four components.
+  P alpha_value = alpha[0];
+  output.x = output.x > 0 ? output.x : (alpha_value * output.x);
+  output.y = output.y > 0 ? output.y : (alpha_value * output.y);
+  output.z = output.z > 0 ? output.z : (alpha_value * output.z);
+  output.w = output.w > 0 ? output.w : (alpha_value * output.w);
+#endif
+  // output already has type VECTOR(P, 4); no conversion is needed for the write.
+  outTexture.write(output, gid.xy, gid.z);
+}
+// Five-tap convolution along x fused with a bias add and a PReLU activation,
+// generated for element type P and for whichever PRELU_* variant the including
+// file defines.
+// One thread computes one texel of output slice gid.z, starting from
+// biase[gid.z]. The taps sit at x offsets -2, -1, 0, +1, +2 times
+// param.dilationX from the centre position; y is not offset. Every component
+// that is not greater than zero is multiplied by its slope before the write.
+kernel void FUNC3_(conv_add_1x5, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<P, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device VECTOR(P, 4) *weights [[buffer(1)]],
+                         const device VECTOR(P, 4) *biase [[buffer(2)]],
+#ifdef PRELU_CHANNEL
+                         const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_ELEMENT
+                         const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_OTHER
+                         const device P *alpha [[buffer(3)]],
+#endif
+                         uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const ushort2 center = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                       + ushort2(param.offsetX, param.offsetY);
+  const ushort stepX = param.dilationX;
+  const uint tapCount = 5;
+  const uint sliceCount = inTexture.get_array_size();
+  // Distance, in vector entries, between the weights of two output components.
+  const uint componentStride = tapCount * sliceCount;
+  const uint weightBase = gid.z * tapCount * sliceCount * 4;
+  VECTOR(P, 4) acc = biase[gid.z];
+  for (uint slice = 0; slice < sliceCount; ++slice) {
+    for (int tap = 0; tap < 5; ++tap) {
+      // tap - 2 walks the offsets -2 .. +2 around the centre column.
+      const float2 pos = float2(center.x + (tap - 2) * stepX, center.y);
+      const VECTOR(P, 4) texel = inTexture.sample(sample, pos, slice);
+      const uint idx = weightBase + tap * sliceCount + slice;
+      acc.x += dot(texel, weights[idx]);
+      acc.y += dot(texel, weights[idx + componentStride]);
+      acc.z += dot(texel, weights[idx + 2 * componentStride]);
+      acc.w += dot(texel, weights[idx + 3 * componentStride]);
+    }
+  }
+#ifdef PRELU_CHANNEL
+  // One slope vector per output slice.
+  const VECTOR(P, 4) slope = alpha[gid.z];
+#endif
+#ifdef PRELU_ELEMENT
+  // One slope vector per output texel, indexed (y * width + x) * slices + z.
+  const uint texelBase = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
+  const VECTOR(P, 4) slope = alpha[texelBase + gid.z];
+#endif
+#ifdef PRELU_OTHER
+  // A single scalar slope shared by all four components.
+  const VECTOR(P, 4) slope = VECTOR(P, 4)(alpha[0]);
+#endif
+#if defined(PRELU_CHANNEL) || defined(PRELU_ELEMENT) || defined(PRELU_OTHER)
+  for (int c = 0; c < 4; ++c) {
+    if (!(acc[c] > 0)) {
+      acc[c] = slope[c] * acc[c];
+    }
+  }
+#endif
+  outTexture.write(acc, gid.xy, gid.z);
+}
+// Depthwise 3x3 convolution fused with a bias add and a PReLU activation,
+// generated for element type P and for whichever PRELU_* variant the including
+// file defines.
+// One thread computes one texel of output slice gid.z from the input slice
+// with the same index: each of the four components is filtered with its own
+// nine scalar weights (component c of slice z uses entries z * 36 + c * 9
+// onwards) and added to biase[gid.z]. Neighbours are always one pixel away;
+// param.dilationX/Y are not read. Every component that is not greater than
+// zero is multiplied by its slope before the texel is written.
+kernel void FUNC3_(depthwise_conv_add_3x3, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
+    texture2d_array<P, access::write> outTexture [[texture(1)]],
+    constant MetalConvParam &param [[buffer(0)]],
+    const device P *weights [[buffer(1)]],
+    const device VECTOR(P, 4) *biase [[buffer(2)]],
+#ifdef PRELU_CHANNEL
+    const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_ELEMENT
+    const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_OTHER
+    const device P *alpha [[buffer(3)]],
+#endif
+    uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint slice = gid.z;
+  const ushort2 center = ushort2(gid.xy) * ushort2(param.strideX, param.strideY)
+                       + ushort2(param.offsetX, param.offsetY);
+  const uint tapCount = 9;
+  const uint weightBase = gid.z * tapCount * 4;
+  VECTOR(P, 4) acc = biase[gid.z];
+  uint tap = 0;
+  for (int dy = -1; dy <= 1; ++dy) {
+    for (int dx = -1; dx <= 1; ++dx, ++tap) {
+      const VECTOR(P, 4) texel = inTexture.sample(sample, float2(center.x + dx, center.y + dy), slice);
+      // Gather this tap's weight for each of the four components.
+      const VECTOR(P, 4) tapWeights = VECTOR(P, 4)(weights[weightBase + tap],
+                                                   weights[weightBase + tapCount + tap],
+                                                   weights[weightBase + 2 * tapCount + tap],
+                                                   weights[weightBase + 3 * tapCount + tap]);
+      acc += texel * tapWeights;
+    }
+  }
+#ifdef PRELU_CHANNEL
+  // One slope vector per output slice.
+  const VECTOR(P, 4) slope = alpha[gid.z];
+#endif
+#ifdef PRELU_ELEMENT
+  // One slope vector per output texel, indexed (y * width + x) * slices + z.
+  const uint texelBase = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
+  const VECTOR(P, 4) slope = alpha[texelBase + gid.z];
+#endif
+#ifdef PRELU_OTHER
+  // A single scalar slope shared by all four components.
+  const VECTOR(P, 4) slope = VECTOR(P, 4)(alpha[0]);
+#endif
+#if defined(PRELU_CHANNEL) || defined(PRELU_ELEMENT) || defined(PRELU_OTHER)
+  for (int c = 0; c < 4; ++c) {
+    if (!(acc[c] > 0)) {
+      acc[c] = slope[c] * acc[c];
+    }
+  }
+#endif
+  outTexture.write(acc, gid.xy, gid.z);
+}
+#endif
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddPreluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddPreluKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+// Instantiates ConvAddPrelu.inc.metal six times: once per element type
+// (float, half) and PReLU mode (channel, element, other).
+// For every inclusion this file defines `P` plus exactly one of
+// PRELU_CHANNEL / PRELU_ELEMENT / PRELU_OTHER, and undefines the mode macros
+// again before the next variant is configured.
+// NOTE(review): PRELU_TYPE is presumably consumed by the included file to
+// build distinct kernel names per variant -- confirm against
+// ConvAddPrelu.inc.metal.
+// ---- float variants ----
+#define P float
+  #define PRELU_CHANNEL prelu_channel
+  #define PRELU_TYPE prelu_channel
+    #include "ConvAddPrelu.inc.metal"
+  #undef  PRELU_TYPE
+  #undef  PRELU_CHANNEL
+  #define PRELU_ELEMENT prelu_element
+  #define PRELU_TYPE prelu_element
+    #include "ConvAddPrelu.inc.metal"
+  #undef  PRELU_TYPE
+  #undef  PRELU_ELEMENT
+  #define PRELU_OTHER   prelu_other
+  #define PRELU_TYPE prelu_other
+    #include "ConvAddPrelu.inc.metal"
+  #undef  PRELU_TYPE
+  #undef  PRELU_OTHER
+#undef P
+// ---- half variants: same three modes, with P redefined as half ----
+#define P half
+  #define PRELU_CHANNEL prelu_channel
+  #define PRELU_TYPE prelu_channel
+    #include "ConvAddPrelu.inc.metal"
+  #undef  PRELU_TYPE
+  #undef  PRELU_CHANNEL
+  #define PRELU_ELEMENT prelu_element
+  #define PRELU_TYPE prelu_element
+    #include "ConvAddPrelu.inc.metal"
+  #undef  PRELU_TYPE
+  #undef  PRELU_ELEMENT
+  #define PRELU_OTHER   prelu_other
+  #define PRELU_TYPE prelu_other
+    #include "ConvAddPrelu.inc.metal"
+  #undef  PRELU_TYPE
+  #undef  PRELU_OTHER
+#undef P
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvBNReluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvBNReluKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+#pragma mark - conv bn relu
+// 1x1 convolution followed by a per-slice scale, a per-slice bias and a clamp
+// at zero, for float textures. `new_scale` / `new_biase` are presumably the
+// folded batch-norm parameters (TODO confirm against the host code).
+//
+// Each thread writes one texel of `outTexture` at (gid.xy, slice gid.z).
+// Weight layout as indexed here: row (gid.z * 4 + c) feeds output component c
+// and holds one float4 per input slice.
+kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                     texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                     constant MetalConvParam &param [[buffer(0)]],
+                                     const device float4 *weights [[buffer(1)]],
+                                     const device float4 *new_scale [[buffer(2)]],
+                                     const device float4 *new_biase [[buffer(3)]],
+                                     uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  // Pixel coordinates, nearest texel, zero outside the texture.
+  constexpr sampler nearestSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  // Input pixel read by this output texel: gid * stride + offset.
+  const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY) +
+                         ushort2(param.offsetX, param.offsetY);
+  const float2 samplePos = float2(origin.x, origin.y);
+  const uint sliceCount = inTexture.get_array_size();
+  // Length of one weight row in float4 units (a single tap per input slice).
+  const uint rowStride = sliceCount;
+  const uint weightBase = gid.z * rowStride * 4;
+  float4 acc = float4(0.0);
+  for (uint slice = 0; slice < sliceCount; ++slice) {
+    const float4 texel = inTexture.sample(nearestSampler, samplePos, slice);
+    const uint column = weightBase + slice;
+    acc.x += dot(texel, weights[column + 0 * rowStride]);
+    acc.y += dot(texel, weights[column + 1 * rowStride]);
+    acc.z += dot(texel, weights[column + 2 * rowStride]);
+    acc.w += dot(texel, weights[column + 3 * rowStride]);
+  }
+  // Scale, shift, then clamp negatives to zero.
+  acc = fmax(acc * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  outTexture.write(acc, gid.xy, gid.z);
+}
+// 3x3 convolution followed by a per-slice scale, a per-slice bias and a clamp
+// at zero, for float textures. `new_scale` / `new_biase` are presumably the
+// folded batch-norm parameters (TODO confirm against the host code).
+//
+// Each thread writes one texel of `outTexture` at (gid.xy, slice gid.z).
+// Weight layout as indexed here: row (gid.z * 4 + c) feeds output component c;
+// inside a row the float4 for tap t and input slice s sits at t * sliceCount + s.
+kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                     texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                     constant MetalConvParam &param [[buffer(0)]],
+                                     const device float4 *weights [[buffer(1)]],
+                                     const device float4 *new_scale [[buffer(2)]],
+                                     const device float4 *new_biase [[buffer(3)]],
+                                     uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  // Pixel coordinates, nearest texel, zero outside the texture.
+  constexpr sampler nearestSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  // Centre of the 3x3 window in the input: gid * stride + offset.
+  const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY) +
+                         ushort2(param.offsetX, param.offsetY);
+  // Signed copy so that the -1 neighbours of column/row 0 become negative
+  // coordinates instead of wrapping.
+  const int2 center = int2(origin.x, origin.y);
+  const uint tapCount = 9;
+  const uint sliceCount = inTexture.get_array_size();
+  const uint rowStride = tapCount * sliceCount;
+  const uint weightBase = gid.z * rowStride * 4;
+  float4 acc = float4(0.0);
+  float4 window[9];
+  for (uint slice = 0; slice < sliceCount; ++slice) {
+    // Gather the neighbourhood row by row, top-left tap first.
+    for (int dy = -1; dy <= 1; ++dy) {
+      for (int dx = -1; dx <= 1; ++dx) {
+        window[(dy + 1) * 3 + (dx + 1)] =
+            inTexture.sample(nearestSampler, float2(center.x + dx, center.y + dy), slice);
+      }
+    }
+    for (uint tap = 0; tap < tapCount; ++tap) {
+      const float4 texel = window[tap];
+      const uint column = weightBase + tap * sliceCount + slice;
+      acc.x += dot(texel, weights[column + 0 * rowStride]);
+      acc.y += dot(texel, weights[column + 1 * rowStride]);
+      acc.z += dot(texel, weights[column + 2 * rowStride]);
+      acc.w += dot(texel, weights[column + 3 * rowStride]);
+    }
+  }
+  // Scale, shift, then clamp negatives to zero.
+  acc = fmax(acc * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  outTexture.write(acc, gid.xy, gid.z);
+}
+// Depthwise 3x3 convolution followed by a per-slice scale, a per-slice bias
+// and a clamp at zero, for float textures. `new_scale` / `new_biase` are
+// presumably the folded batch-norm parameters (TODO confirm against the host).
+//
+// Each thread reads only input slice gid.z and writes one texel of
+// `outTexture` at (gid.xy, slice gid.z). `weights` is a flat float array: the
+// nine taps of component c of slice z start at z * 36 + c * 9.
+kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                               texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                               constant MetalConvParam &param [[buffer(0)]],
+                                               const device float *weights [[buffer(1)]],
+                                               const device float4 *new_scale [[buffer(2)]],
+                                               const device float4 *new_biase [[buffer(3)]],
+                                               uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  // Pixel coordinates, nearest texel, zero outside the texture.
+  constexpr sampler nearestSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint slice = gid.z;
+  // Centre of the 3x3 window in the input: gid * stride + offset.
+  const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY) +
+                         ushort2(param.offsetX, param.offsetY);
+  const int2 center = int2(origin.x, origin.y);
+  const uint tapCount = 9;
+  const uint weightBase = slice * tapCount * 4;
+  float4 acc = float4(0.0);
+  // Walk the window row by row (top-left tap first), accumulating as we go.
+  uint tap = 0;
+  for (int dy = -1; dy <= 1; ++dy) {
+    for (int dx = -1; dx <= 1; ++dx) {
+      const float4 texel =
+          inTexture.sample(nearestSampler, float2(center.x + dx, center.y + dy), slice);
+      const uint column = weightBase + tap;
+      acc.x += texel.x * weights[column + 0 * tapCount];
+      acc.y += texel.y * weights[column + 1 * tapCount];
+      acc.z += texel.z * weights[column + 2 * tapCount];
+      acc.w += texel.w * weights[column + 3 * tapCount];
+      ++tap;
+    }
+  }
+  // Scale, shift, then clamp negatives to zero.
+  acc = fmax(acc * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  outTexture.write(acc, gid.xy, gid.z);
+}
+#pragma mark - half
+// Half-precision-storage variant of the 1x1 convolution with per-slice scale,
+// bias and clamp at zero. Texels, weights, scale and bias are stored as half
+// but are widened to float before any arithmetic; the float result is
+// narrowed to half only when written.
+//
+// Each thread writes one texel of `outTexture` at (gid.xy, slice gid.z).
+// Weight layout as indexed here: row (gid.z * 4 + c) feeds output component c
+// and holds one half4 per input slice.
+kernel void conv_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                          texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                          constant MetalConvParam &param [[buffer(0)]],
+                                          const device half4 *weights [[buffer(1)]],
+                                          const device half4 *new_scale [[buffer(2)]],
+                                          const device half4 *new_biase [[buffer(3)]],
+                                          uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  // Pixel coordinates, nearest texel, zero outside the texture.
+  constexpr sampler nearestSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  // Input pixel read by this output texel: gid * stride + offset.
+  const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY) +
+                         ushort2(param.offsetX, param.offsetY);
+  const float2 samplePos = float2(origin.x, origin.y);
+  const uint sliceCount = inTexture.get_array_size();
+  // Length of one weight row in half4 units (a single tap per input slice).
+  const uint rowStride = sliceCount;
+  const uint weightBase = gid.z * rowStride * 4;
+  float4 acc = float4(0.0);
+  for (uint slice = 0; slice < sliceCount; ++slice) {
+    const float4 texel = float4(inTexture.sample(nearestSampler, samplePos, slice));
+    const uint column = weightBase + slice;
+    acc.x += dot(texel, float4(weights[column + 0 * rowStride]));
+    acc.y += dot(texel, float4(weights[column + 1 * rowStride]));
+    acc.z += dot(texel, float4(weights[column + 2 * rowStride]));
+    acc.w += dot(texel, float4(weights[column + 3 * rowStride]));
+  }
+  // Scale, shift, then clamp negatives to zero -- all in float.
+  acc = fmax(acc * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+  outTexture.write(half4(acc), gid.xy, gid.z);
+}
+// Half-precision-storage variant of the 3x3 convolution with per-slice scale,
+// bias and clamp at zero. Texels, weights, scale and bias are stored as half
+// but are widened to float before any arithmetic; the float result is
+// narrowed to half only when written.
+//
+// Each thread writes one texel of `outTexture` at (gid.xy, slice gid.z).
+// Weight layout as indexed here: row (gid.z * 4 + c) feeds output component c;
+// inside a row the half4 for tap t and input slice s sits at t * sliceCount + s.
+kernel void conv_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                          texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                          constant MetalConvParam &param [[buffer(0)]],
+                                          const device half4 *weights [[buffer(1)]],
+                                          const device half4 *new_scale [[buffer(2)]],
+                                          const device half4 *new_biase [[buffer(3)]],
+                                          uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  // Pixel coordinates, nearest texel, zero outside the texture.
+  constexpr sampler nearestSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  // Centre of the 3x3 window in the input: gid * stride + offset.
+  const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY) +
+                         ushort2(param.offsetX, param.offsetY);
+  // Signed copy so that the -1 neighbours of column/row 0 become negative
+  // coordinates instead of wrapping.
+  const int2 center = int2(origin.x, origin.y);
+  const uint tapCount = 9;
+  const uint sliceCount = inTexture.get_array_size();
+  const uint rowStride = tapCount * sliceCount;
+  const uint weightBase = gid.z * rowStride * 4;
+  float4 acc = float4(0.0);
+  half4 window[9];
+  for (uint slice = 0; slice < sliceCount; ++slice) {
+    // Gather the neighbourhood row by row, top-left tap first.
+    for (int dy = -1; dy <= 1; ++dy) {
+      for (int dx = -1; dx <= 1; ++dx) {
+        window[(dy + 1) * 3 + (dx + 1)] =
+            inTexture.sample(nearestSampler, float2(center.x + dx, center.y + dy), slice);
+      }
+    }
+    for (uint tap = 0; tap < tapCount; ++tap) {
+      const float4 texel = float4(window[tap]);
+      const uint column = weightBase + tap * sliceCount + slice;
+      acc.x += dot(texel, float4(weights[column + 0 * rowStride]));
+      acc.y += dot(texel, float4(weights[column + 1 * rowStride]));
+      acc.z += dot(texel, float4(weights[column + 2 * rowStride]));
+      acc.w += dot(texel, float4(weights[column + 3 * rowStride]));
+    }
+  }
+  // Scale, shift, then clamp negatives to zero -- all in float.
+  acc = fmax(acc * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+  outTexture.write(half4(acc), gid.xy, gid.z);
+}
+// Half-precision-storage variant of the depthwise 3x3 convolution with
+// per-slice scale, bias and clamp at zero.
+//
+// Each thread reads only input slice gid.z and writes one texel of
+// `outTexture` at (gid.xy, slice gid.z). `weights` is a flat half array: the
+// nine taps of component c of slice z start at z * 36 + c * 9.
+//
+// Texels and weights are widened to float BEFORE they are multiplied. A
+// half * half product would be rounded to half (and can overflow) before it
+// reaches the float accumulator; widening first matches
+// depthwise_conv_3x3_half and the other *_half kernels in this file.
+kernel void depthwise_conv_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                                    texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                                    constant MetalConvParam &param [[buffer(0)]],
+                                                    const device half *weights [[buffer(1)]],
+                                                    const device half4 *new_scale [[buffer(2)]],
+                                                    const device half4 *new_biase [[buffer(3)]],
+                                                    uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  uint output_slice = gid.z;
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  // Centre of the 3x3 window in the input: gid * stride + offset.
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  // Pixel coordinates, nearest texel, zero outside the texture.
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9;
+  uint weightTo = gid.z * kernelHXW * 4;
+  float4 output = float4(0.0);
+  // Neighbourhood gathered row by row, top-left tap first.
+  half4 inputs[9];
+  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+  for (int j = 0; j < 9; ++j) {
+    // Widen once per tap so every product below is a float * float multiply.
+    float4 input = float4(inputs[j]);
+    output.x += input.x * float(weights[weightTo + 0 * kernelHXW + j]);
+    output.y += input.y * float(weights[weightTo + 1 * kernelHXW + j]);
+    output.z += input.z * float(weights[weightTo + 2 * kernelHXW + j]);
+    output.w += input.w * float(weights[weightTo + 3 * kernelHXW + j]);
+  }
+  // Scale, shift, then clamp negatives to zero -- all in float.
+  output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+  outTexture.write(half4(output), gid.xy, gid.z);
+}
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+// conv
+#pragma mark -- conv
+// Plain 3x3 convolution for float textures (no bias or activation applied).
+//
+// Each thread writes one texel of `outTexture` at (gid.xy, slice gid.z).
+// Weight layout as indexed here: row (gid.z * 4 + c) feeds output component c;
+// inside a row the float4 for tap t and input slice s sits at t * sliceCount + s.
+kernel void conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                     texture2d_array<float, access::write> outTexture [[texture(1)]],
+                     constant MetalConvParam &param [[buffer(0)]],
+                     const device float4 *weights [[buffer(1)]],
+                     uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  // Pixel coordinates, nearest texel, zero outside the texture.
+  constexpr sampler nearestSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  // Centre of the 3x3 window in the input: gid * stride + offset.
+  const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY) +
+                         ushort2(param.offsetX, param.offsetY);
+  // Signed copy so that the -1 neighbours of column/row 0 become negative
+  // coordinates instead of wrapping.
+  const int2 center = int2(origin.x, origin.y);
+  const uint tapCount = 9;
+  const uint sliceCount = inTexture.get_array_size();
+  const uint rowStride = tapCount * sliceCount;
+  const uint weightBase = gid.z * rowStride * 4;
+  float4 acc = float4(0.0);
+  float4 window[9];
+  for (uint slice = 0; slice < sliceCount; ++slice) {
+    // Gather the neighbourhood row by row, top-left tap first.
+    for (int dy = -1; dy <= 1; ++dy) {
+      for (int dx = -1; dx <= 1; ++dx) {
+        window[(dy + 1) * 3 + (dx + 1)] =
+            inTexture.sample(nearestSampler, float2(center.x + dx, center.y + dy), slice);
+      }
+    }
+    for (uint tap = 0; tap < tapCount; ++tap) {
+      const float4 texel = window[tap];
+      const uint column = weightBase + tap * sliceCount + slice;
+      acc.x += dot(texel, weights[column + 0 * rowStride]);
+      acc.y += dot(texel, weights[column + 1 * rowStride]);
+      acc.z += dot(texel, weights[column + 2 * rowStride]);
+      acc.w += dot(texel, weights[column + 3 * rowStride]);
+    }
+  }
+  outTexture.write(acc, gid.xy, gid.z);
+}
+// Plain depthwise 3x3 convolution for float textures (no bias or activation).
+//
+// Each thread reads only input slice gid.z and writes one texel of
+// `outTexture` at (gid.xy, slice gid.z). `weights` is a flat float array: the
+// nine taps of component c of slice z start at z * 36 + c * 9.
+kernel void depthwise_conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                               texture2d_array<float, access::write> outTexture [[texture(1)]],
+                               constant MetalConvParam &param [[buffer(0)]],
+                               const device float *weights [[buffer(1)]],
+                               uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  // Pixel coordinates, nearest texel, zero outside the texture.
+  constexpr sampler nearestSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint slice = gid.z;
+  // Centre of the 3x3 window in the input: gid * stride + offset.
+  const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY) +
+                         ushort2(param.offsetX, param.offsetY);
+  const int2 center = int2(origin.x, origin.y);
+  const uint tapCount = 9;
+  const uint weightBase = slice * tapCount * 4;
+  float4 acc = float4(0.0);
+  // Walk the window row by row (top-left tap first), accumulating as we go.
+  uint tap = 0;
+  for (int dy = -1; dy <= 1; ++dy) {
+    for (int dx = -1; dx <= 1; ++dx) {
+      const float4 texel =
+          inTexture.sample(nearestSampler, float2(center.x + dx, center.y + dy), slice);
+      const uint column = weightBase + tap;
+      acc.x += texel.x * weights[column + 0 * tapCount];
+      acc.y += texel.y * weights[column + 1 * tapCount];
+      acc.z += texel.z * weights[column + 2 * tapCount];
+      acc.w += texel.w * weights[column + 3 * tapCount];
+      ++tap;
+    }
+  }
+  outTexture.write(acc, gid.xy, gid.z);
+}
+// Plain 1x1 convolution for float textures (no bias or activation applied).
+//
+// Each thread writes one texel of `outTexture` at (gid.xy, slice gid.z).
+// Weight layout as indexed here: row (gid.z * 4 + c) feeds output component c
+// and holds one float4 per input slice.
+kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                     texture2d_array<float, access::write> outTexture [[texture(1)]],
+                     constant MetalConvParam &param [[buffer(0)]],
+                     const device float4 *weights [[buffer(1)]],
+                     uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  // Pixel coordinates, nearest texel, zero outside the texture.
+  constexpr sampler nearestSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  // Input pixel read by this output texel: gid * stride + offset.
+  const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY) +
+                         ushort2(param.offsetX, param.offsetY);
+  const float2 samplePos = float2(origin.x, origin.y);
+  const uint sliceCount = inTexture.get_array_size();
+  // Length of one weight row in float4 units (a single tap per input slice).
+  const uint rowStride = sliceCount;
+  const uint weightBase = gid.z * rowStride * 4;
+  float4 acc = float4(0.0);
+  for (uint slice = 0; slice < sliceCount; ++slice) {
+    const float4 texel = inTexture.sample(nearestSampler, samplePos, slice);
+    const uint column = weightBase + slice;
+    acc.x += dot(texel, weights[column + 0 * rowStride]);
+    acc.y += dot(texel, weights[column + 1 * rowStride]);
+    acc.z += dot(texel, weights[column + 2 * rowStride]);
+    acc.w += dot(texel, weights[column + 3 * rowStride]);
+  }
+  outTexture.write(acc, gid.xy, gid.z);
+}
+// Half-precision-storage variant of the plain 3x3 convolution. Texels and
+// weights are stored as half but widened to float before the dot products;
+// the float result is narrowed to half only when written.
+//
+// Each thread writes one texel of `outTexture` at (gid.xy, slice gid.z).
+// Weight layout as indexed here: row (gid.z * 4 + c) feeds output component c;
+// inside a row the half4 for tap t and input slice s sits at t * sliceCount + s.
+kernel void conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                          texture2d_array<half, access::write> outTexture [[texture(1)]],
+                          constant MetalConvParam &param [[buffer(0)]],
+                          const device half4 *weights [[buffer(1)]],
+                          uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  // Pixel coordinates, nearest texel, zero outside the texture.
+  constexpr sampler nearestSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  // Centre of the 3x3 window in the input: gid * stride + offset.
+  const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY) +
+                         ushort2(param.offsetX, param.offsetY);
+  // Signed copy so that the -1 neighbours of column/row 0 become negative
+  // coordinates instead of wrapping.
+  const int2 center = int2(origin.x, origin.y);
+  const uint tapCount = 9;
+  const uint sliceCount = inTexture.get_array_size();
+  const uint rowStride = tapCount * sliceCount;
+  const uint weightBase = gid.z * rowStride * 4;
+  float4 acc = float4(0.0);
+  half4 window[9];
+  for (uint slice = 0; slice < sliceCount; ++slice) {
+    // Gather the neighbourhood row by row, top-left tap first.
+    for (int dy = -1; dy <= 1; ++dy) {
+      for (int dx = -1; dx <= 1; ++dx) {
+        window[(dy + 1) * 3 + (dx + 1)] =
+            inTexture.sample(nearestSampler, float2(center.x + dx, center.y + dy), slice);
+      }
+    }
+    for (uint tap = 0; tap < tapCount; ++tap) {
+      const float4 texel = float4(window[tap]);
+      const uint column = weightBase + tap * sliceCount + slice;
+      acc.x += dot(texel, float4(weights[column + 0 * rowStride]));
+      acc.y += dot(texel, float4(weights[column + 1 * rowStride]));
+      acc.z += dot(texel, float4(weights[column + 2 * rowStride]));
+      acc.w += dot(texel, float4(weights[column + 3 * rowStride]));
+    }
+  }
+  outTexture.write(half4(acc), gid.xy, gid.z);
+}
+// Half-precision-storage variant of the plain depthwise 3x3 convolution.
+// Texel components and weights are widened to float before each multiply; the
+// float result is narrowed to half only when written.
+//
+// Each thread reads only input slice gid.z and writes one texel of
+// `outTexture` at (gid.xy, slice gid.z). `weights` is a flat half array: the
+// nine taps of component c of slice z start at z * 36 + c * 9.
+kernel void depthwise_conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                    texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                    constant MetalConvParam &param [[buffer(0)]],
+                                    const device half *weights [[buffer(1)]],
+                                    uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  // Pixel coordinates, nearest texel, zero outside the texture.
+  constexpr sampler nearestSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint slice = gid.z;
+  // Centre of the 3x3 window in the input: gid * stride + offset.
+  const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY) +
+                         ushort2(param.offsetX, param.offsetY);
+  const int2 center = int2(origin.x, origin.y);
+  const uint tapCount = 9;
+  const uint weightBase = slice * tapCount * 4;
+  float4 acc = float4(0.0);
+  // Walk the window row by row (top-left tap first), accumulating as we go.
+  uint tap = 0;
+  for (int dy = -1; dy <= 1; ++dy) {
+    for (int dx = -1; dx <= 1; ++dx) {
+      const half4 texel =
+          inTexture.sample(nearestSampler, float2(center.x + dx, center.y + dy), slice);
+      const uint column = weightBase + tap;
+      acc.x += float(texel.x) * float(weights[column + 0 * tapCount]);
+      acc.y += float(texel.y) * float(weights[column + 1 * tapCount]);
+      acc.z += float(texel.z) * float(weights[column + 2 * tapCount]);
+      acc.w += float(texel.w) * float(weights[column + 3 * tapCount]);
+      ++tap;
+    }
+  }
+  outTexture.write(half4(acc), gid.xy, gid.z);
+}
+// Half-precision-storage variant of the plain 1x1 convolution. Texels and
+// weights are stored as half but widened to float before the dot products;
+// the float result is narrowed to half only when written.
+//
+// Each thread writes one texel of `outTexture` at (gid.xy, slice gid.z).
+// Weight layout as indexed here: row (gid.z * 4 + c) feeds output component c
+// and holds one half4 per input slice.
+kernel void conv_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                          texture2d_array<half, access::write> outTexture [[texture(1)]],
+                          constant MetalConvParam &param [[buffer(0)]],
+                          const device half4 *weights [[buffer(1)]],
+                          uint3 gid [[thread_position_in_grid]]) {
+  // Threads that fall outside the output extent have nothing to write.
+  const bool insideOutput = gid.x < outTexture.get_width() &&
+                            gid.y < outTexture.get_height() &&
+                            gid.z < outTexture.get_array_size();
+  if (!insideOutput) {
+    return;
+  }
+  // Pixel coordinates, nearest texel, zero outside the texture.
+  constexpr sampler nearestSampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  // Input pixel read by this output texel: gid * stride + offset.
+  const ushort2 origin = ushort2(gid.xy) * ushort2(param.strideX, param.strideY) +
+                         ushort2(param.offsetX, param.offsetY);
+  const float2 samplePos = float2(origin.x, origin.y);
+  const uint sliceCount = inTexture.get_array_size();
+  // Length of one weight row in half4 units (a single tap per input slice).
+  const uint rowStride = sliceCount;
+  const uint weightBase = gid.z * rowStride * 4;
+  float4 acc = float4(0.0);
+  for (uint slice = 0; slice < sliceCount; ++slice) {
+    const float4 texel = float4(inTexture.sample(nearestSampler, samplePos, slice));
+    const uint column = weightBase + slice;
+    acc.x += dot(texel, float4(weights[column + 0 * rowStride]));
+    acc.y += dot(texel, float4(weights[column + 1 * rowStride]));
+    acc.z += dot(texel, float4(weights[column + 2 * rowStride]));
+    acc.w += dot(texel, float4(weights[column + 3 * rowStride]));
+  }
+  outTexture.write(half4(acc), gid.xy, gid.z);
+}
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvTransposeKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvTransposeKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+using namespace metal;
+// Parameter block bound at buffer(0) of the transposed-convolution kernels in
+// this file. The two active 2x2 / stride-2 kernels accept it but never read a
+// field; only the commented-out generic conv_transpose kernel references them.
+// NOTE(review): the per-field notes are inferred from the names and from that
+// commented-out kernel - confirm against the host-side definition.
+struct MetalConvTransposeParam{
+  // presumably kernel width / height in texels
+  ushort kernelW;
+  ushort kernelH;
+  // presumably stride along x / y
+  ushort strideX;
+  ushort strideY;
+  // presumably padding along x / y
+  ushort paddingX;
+  ushort paddingY;
+  // presumably dilation along x / y
+  ushort dilationX;
+  ushort dilationY;
+};
+// Transposed convolution specialised for a 2x2 kernel with stride 2 (float).
+//
+// Each thread produces one output texel, i.e. four output channels at
+// (gid.x, gid.y) in output slice gid.z. The input texel is taken at gid.xy / 2
+// and the kernel tap at gid.xy % 2, so the only reduction is over the input
+// texture's array slices.
+//
+// weights is indexed as
+//   [output slice][output channel 0..3][kernel tap 0..3][input slice]
+// with one float4 per entry that is dotted against one input texel.
+// `param` is part of the binding signature but is not read by this kernel.
+kernel void conv_transpose2x2_stride2(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                      texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                      constant MetalConvTransposeParam &param [[buffer(0)]],
+                                      const device float4 *weights [[buffer(1)]],
+                                      uint3 gid [[thread_position_in_grid]]) {
+  // Threads dispatched past the edge of the output have nothing to write.
+  const bool in_range = gid.x < outTexture.get_width() &&
+                        gid.y < outTexture.get_height() &&
+                        gid.z < outTexture.get_array_size();
+  if (!in_range) {
+    return;
+  }
+  constexpr sampler texel_sampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const int slice_count = inTexture.get_array_size();
+  // Distance, in float4 entries, between two consecutive output channels.
+  const int channel_stride = slice_count * 4;
+  // Which of the four kernel taps feeds this output texel.
+  const int tap = (gid.y % 2) * 2 + (gid.x % 2);
+  const int weight_base = gid.z * slice_count * 4 * 4 + (tap * slice_count);
+  const int src_x = gid.x / 2;
+  const int src_y = gid.y / 2;
+  float4 acc = float4(0.0);
+  for (int s = 0; s < slice_count; ++s) {
+    const float4 texel = inTexture.sample(texel_sampler, float2(src_x, src_y), s);
+    const device float4 *w = weights + weight_base + s;
+    acc += float4(dot(texel, w[channel_stride * 0]),
+                  dot(texel, w[channel_stride * 1]),
+                  dot(texel, w[channel_stride * 2]),
+                  dot(texel, w[channel_stride * 3]));
+  }
+  outTexture.write(acc, gid.xy, gid.z);
+}
+// Half-precision storage variant of conv_transpose2x2_stride2.
+//
+// Texels and weights are stored as half4, but every dot product and the
+// running sum are evaluated in float; the result is narrowed to half4 only
+// when it is written. Indexing matches the float kernel: the input texel is
+// gid.xy / 2, the kernel tap is gid.xy % 2, and weights is laid out as
+//   [output slice][output channel 0..3][kernel tap 0..3][input slice].
+// `param` is part of the binding signature but is not read by this kernel.
+kernel void conv_transpose2x2_stride2_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                      texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                      constant MetalConvTransposeParam &param [[buffer(0)]],
+                                      const device half4 *weights [[buffer(1)]],
+                                      uint3 gid [[thread_position_in_grid]]) {
+  // Threads dispatched past the edge of the output have nothing to write.
+  const bool in_range = gid.x < outTexture.get_width() &&
+                        gid.y < outTexture.get_height() &&
+                        gid.z < outTexture.get_array_size();
+  if (!in_range) {
+    return;
+  }
+  constexpr sampler texel_sampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const int slice_count = inTexture.get_array_size();
+  // Distance, in half4 entries, between two consecutive output channels.
+  const int channel_stride = slice_count * 4;
+  // Which of the four kernel taps feeds this output texel.
+  const int tap = (gid.y % 2) * 2 + (gid.x % 2);
+  const int weight_base = gid.z * slice_count * 4 * 4 + (tap * slice_count);
+  const int src_x = gid.x / 2;
+  const int src_y = gid.y / 2;
+  float4 acc = float4(0.0);
+  for (int s = 0; s < slice_count; ++s) {
+    const half4 texel_h = inTexture.sample(texel_sampler, float2(src_x, src_y), s);
+    const float4 texel = float4(texel_h);
+    const device half4 *w = weights + weight_base + s;
+    acc += float4(dot(texel, float4(w[channel_stride * 0])),
+                  dot(texel, float4(w[channel_stride * 1])),
+                  dot(texel, float4(w[channel_stride * 2])),
+                  dot(texel, float4(w[channel_stride * 3])));
+  }
+  outTexture.write(half4(acc), gid.xy, gid.z);
+}
+//kernel void conv_transpose(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+//                           texture2d_array<float, access::write> outTexture [[texture(1)]],
+//                           constant MetalConvTransposeParam &param [[buffer(0)]],
+//                           const device float4 *weights [[buffer(1)]],
+//                           uint3 gid [[thread_position_in_grid]]){
+//  if (gid.x >= outTexture.get_width() ||
+//      gid.y >= outTexture.get_height() ||
+//      gid.z >= outTexture.get_array_size()) {
+//    return;
+//  }
+//
+//  int input_array_size = inTexture.get_array_size();
+//
+//  uint kernel_one_output_slice = input_array_size * param.kernelW * param.kernelH;
+//
+//  uint kernel_stride_z = gid.z * 4 * (kernel_one_output_slice);
+//
+//  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+//
+//  float4 output;
+//
+//  for (int w = 0; w < param.kernelW; ++w) {
+//    int top = gid.x - w * param.dilationX + param.paddingX;
+//    int input_x = top / param.strideX;
+//    if (top < 0 || input_x >= int(inTexture.get_width())) {
+//      continue;
+//    }
+//
+//    for (int h = 0; h < param.kernelH; ++h) {
+//      int top_y = gid.y - h * param.dilationY + param.paddingY;
+//      int input_y = top_y / param.strideY;
+//      if (top_y < 0 || input_y >= int(inTexture.get_height())) {
+//        continue;
+//      }
+//
+//      uint kernel_index = (w * param.kernelH + h) * inTexture.get_array_size();
+//
+//      for (int slice = 0; slice < input_array_size; ++slice) {
+//
+//        float4 input;
+//        float4 kernel_slice = weights[kernel_stride_z + 0 * kernel_one_output_slice + kernel_index + slice];
+//        float4 kernel_slice1 = weights[kernel_stride_z + 1 * kernel_one_output_slice + kernel_index + slice];
+//
+//        float4 kernel_slice2 = weights[kernel_stride_z + 2 * kernel_one_output_slice + kernel_index + slice];
+//
+//        float4 kernel_slice3 = weights[kernel_stride_z + 3 * kernel_one_output_slice + kernel_index + slice];
+//
+//        input = inTexture.sample(sample, float2(input_x,    input_y), slice);
+//        output.x += dot(input, kernel_slice);
+//        output.y += dot(input, kernel_slice1);
+//        output.z += dot(input, kernel_slice2);
+//        output.w += dot(input, kernel_slice3);
+//      }
+//    }
+//  }
+//
+//  outTexture.write(output, gid.xy, gid.z);
+//}
+//
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Elementwise.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Elementwise.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+// Parameters for the elementwise_add kernels in this file, bound at buffer(0).
+//   fast           - when equal to 1 the kernels read X and Y at the same
+//                    texel and skip all index remapping.
+//   axis, ylen     - on the slow path, ylen consecutive entries starting at
+//                    index `axis` of the remapped X coordinate are copied
+//                    into the last ylen entries of the Y coordinate.
+//   xdim, ydim     - only element [3] is read by these kernels; it is handed
+//                    to xyzn2abcd / abcd2xyzn.
+//   xtrans, ytrans - copied into thread-local arrays and handed to
+//                    invtrans / trans.
+// NOTE(review): presumably xdim/ydim are tensor dimensions and xtrans/ytrans
+// axis permutations - confirm against Common.metal and the host code.
+struct ElementwiseAddParam {
+  int32_t fast;
+  int32_t axis;
+  int32_t ylen;
+  int32_t xdim[4];
+  int32_t xtrans[4];
+  int32_t ydim[4];
+  int32_t ytrans[4];
+};
+// Element-wise sum of two texture arrays (float): out = X + Y.
+//
+// X is always read at this thread's own texel. Y is read either
+//   * at the same texel, when pm.fast == 1, or
+//   * one component at a time: for each of the four components of the X
+//     texel the position is converted with xyzn2abcd, mapped through
+//     invtrans(xtrans), the pm.ylen entries starting at pm.axis are copied
+//     into the tail of a zero-initialised Y coordinate, and that coordinate
+//     is mapped back through trans(ytrans) and abcd2xyzn to a Y texel and
+//     component.
+// The helper functions are expected to come from Common.metal.
+kernel void elementwise_add(texture2d_array<float, access::read> inputX [[texture(0)]],
+                            texture2d_array<float, access::read> inputY [[texture(1)]],
+                            texture2d_array<float, access::write> outTexture [[texture(2)]],
+                            constant ElementwiseAddParam &pm [[buffer(0)]],
+                            uint3 gid [[thread_position_in_grid]]) {
+  const bool in_range = gid.x < outTexture.get_width() &&
+                        gid.y < outTexture.get_height() &&
+                        gid.z < outTexture.get_array_size();
+  if (!in_range) return;
+  // Both paths read X at the thread's own coordinate.
+  const float4 rx = inputX.read(gid.xy, gid.z);
+  float4 ry;
+  if (pm.fast == 1) {
+    ry = inputY.read(gid.xy, gid.z);
+  } else {
+    int32_t x_pos[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0};
+    int32_t x_idx[4], scratch[4];
+    int32_t y_idx[4] = {0, 0, 0, 0};
+    int32_t y_pos[4];
+    // Thread-local copies of the two index tables for the helper calls.
+    int32_t x_perm[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
+    int32_t y_perm[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
+    // First slot of y_idx that receives a copied entry.
+    const int32_t y_first = 4 - pm.ylen;
+    for (int lane = 0; lane < 4; lane++) {
+      x_pos[3] = lane;
+      xyzn2abcd(pm.xdim[3], x_pos, x_idx);
+      invtrans(x_perm, x_idx, scratch);
+      for (int j = 0; j < pm.ylen; j++) {
+        y_idx[y_first + j] = scratch[pm.axis + j];
+      }
+      trans(y_perm, y_idx, scratch);
+      abcd2xyzn(pm.ydim[3], scratch, y_pos);
+      ry[lane] = inputY.read(uint2(y_pos[0], y_pos[1]), y_pos[2])[y_pos[3]];
+    }
+  }
+  outTexture.write(rx + ry, gid.xy, gid.z);
+}
+// Element-wise sum of two texture arrays (half): out = X + Y.
+//
+// Same algorithm as elementwise_add, with half4 texels and a half4 sum.
+// When pm.fast == 1, Y is read at the thread's own texel. Otherwise each of
+// the four Y components is looked up separately through
+// xyzn2abcd -> invtrans(xtrans) -> axis copy -> trans(ytrans) -> abcd2xyzn.
+// The helper functions are expected to come from Common.metal.
+kernel void elementwise_add_half(texture2d_array<half, access::read> inputX [[texture(0)]],
+                            texture2d_array<half, access::read> inputY [[texture(1)]],
+                            texture2d_array<half, access::write> outTexture [[texture(2)]],
+                            constant ElementwiseAddParam &pm [[buffer(0)]],
+                            uint3 gid [[thread_position_in_grid]]) {
+  const bool in_range = gid.x < outTexture.get_width() &&
+                        gid.y < outTexture.get_height() &&
+                        gid.z < outTexture.get_array_size();
+  if (!in_range) return;
+  // Both paths read X at the thread's own coordinate.
+  const half4 rx = inputX.read(gid.xy, gid.z);
+  half4 ry;
+  if (pm.fast == 1) {
+    ry = inputY.read(gid.xy, gid.z);
+  } else {
+    int32_t x_pos[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0};
+    int32_t x_idx[4], scratch[4];
+    int32_t y_idx[4] = {0, 0, 0, 0};
+    int32_t y_pos[4];
+    // Thread-local copies of the two index tables for the helper calls.
+    int32_t x_perm[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
+    int32_t y_perm[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
+    // First slot of y_idx that receives a copied entry.
+    const int32_t y_first = 4 - pm.ylen;
+    for (int lane = 0; lane < 4; lane++) {
+      x_pos[3] = lane;
+      xyzn2abcd(pm.xdim[3], x_pos, x_idx);
+      invtrans(x_perm, x_idx, scratch);
+      for (int j = 0; j < pm.ylen; j++) {
+        y_idx[y_first + j] = scratch[pm.axis + j];
+      }
+      trans(y_perm, y_idx, scratch);
+      abcd2xyzn(pm.ydim[3], scratch, y_pos);
+      ry[lane] = inputY.read(uint2(y_pos[0], y_pos[1]), y_pos[2])[y_pos[3]];
+    }
+  }
+  outTexture.write(rx + ry, gid.xy, gid.z);
+}
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ElementwiseAddPreluKernel.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ElementwiseAddPreluKernel.inc.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+// Template body for the fused "elementwise add + PReLU" kernels.
+//
+// This file only produces code when P is defined. The includer provides:
+//   P          - scalar element type of the textures and of alpha
+//   PRELU_TYPE - token spliced into the generated kernel name
+//   at most one of PRELU_CHANNEL / PRELU_ELEMENT / PRELU_OTHER, selecting how
+//   the negative-side slope is fetched:
+//     PRELU_CHANNEL - one 4-vector per output slice:   alpha[gid.z]
+//     PRELU_ELEMENT - one 4-vector per output texel:
+//                     alpha[(gid.y * width + gid.x) * array_size + gid.z]
+//     PRELU_OTHER   - a single scalar:                 alpha[0]
+// ElementwiseAddParam and the index helpers (xyzn2abcd, invtrans, trans,
+// abcd2xyzn) must already be declared by the includer.
+#ifdef P
+#include <metal_stdlib>
+#include "Macro.metal"
+using namespace metal;
+kernel void FUNC3_(elementwise_add, PRELU_TYPE, P)(texture2d_array<P, access::read> inputX [[texture(0)]],
+                                 texture2d_array<P, access::read> inputY [[texture(1)]],
+                                 texture2d_array<P, access::write> outTexture [[texture(2)]],
+                                 constant ElementwiseAddParam &pm [[buffer(0)]],
+#if defined(PRELU_CHANNEL) || defined(PRELU_ELEMENT)
+                                 const device VECTOR(P, 4) *alpha [[buffer(1)]],
+#endif
+#ifdef PRELU_OTHER
+                                 const device P *alpha [[buffer(1)]],
+#endif
+                                 uint3 gid [[thread_position_in_grid]]) {
+  const bool in_range = gid.x < outTexture.get_width() &&
+                        gid.y < outTexture.get_height() &&
+                        gid.z < outTexture.get_array_size();
+  if (!in_range) return;
+  // Both paths read X at the thread's own coordinate.
+  const VECTOR(P, 4) rx = inputX.read(gid.xy, gid.z);
+  VECTOR(P, 4) ry;
+  if (pm.fast == 1) {
+    ry = inputY.read(gid.xy, gid.z);
+  } else {
+    int32_t x_pos[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0};
+    int32_t x_idx[4], scratch[4];
+    int32_t y_idx[4] = {0, 0, 0, 0};
+    int32_t y_pos[4];
+    // Thread-local copies of the two index tables for the helper calls.
+    int32_t x_perm[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
+    int32_t y_perm[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
+    // First slot of y_idx that receives a copied entry.
+    const int32_t y_first = 4 - pm.ylen;
+    for (int lane = 0; lane < 4; lane++) {
+      x_pos[3] = lane;
+      xyzn2abcd(pm.xdim[3], x_pos, x_idx);
+      invtrans(x_perm, x_idx, scratch);
+      for (int j = 0; j < pm.ylen; j++) {
+        y_idx[y_first + j] = scratch[pm.axis + j];
+      }
+      trans(y_perm, y_idx, scratch);
+      abcd2xyzn(pm.ydim[3], scratch, y_pos);
+      ry[lane] = inputY.read(uint2(y_pos[0], y_pos[1]), y_pos[2])[y_pos[3]];
+    }
+  }
+  VECTOR(P, 4) output = rx + ry;
+#if defined(PRELU_CHANNEL) || defined(PRELU_ELEMENT) || defined(PRELU_OTHER)
+  // Fetch the slope for this texel according to the selected mode.
+#if defined(PRELU_CHANNEL)
+  const VECTOR(P, 4) slope = alpha[gid.z];
+#elif defined(PRELU_ELEMENT)
+  const int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
+  const VECTOR(P, 4) slope = alpha[alpha_to + gid.z];
+#else
+  const VECTOR(P, 4) slope = VECTOR(P, 4)(alpha[0]);
+#endif
+  // PReLU: positive values pass through, the rest are scaled by the slope.
+  output.x = output.x > 0 ? output.x : (slope.x * output.x);
+  output.y = output.y > 0 ? output.y : (slope.y * output.y);
+  output.z = output.z > 0 ? output.z : (slope.z * output.z);
+  output.w = output.w > 0 ? output.w : (slope.w * output.w);
+#endif
+  outTexture.write(output, gid.xy, gid.z);
+}
+#endif
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ElementwiseAddPreluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ElementwiseAddPreluKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+// Parameters for the kernels generated from ElementwiseAddPreluKernel.inc.metal,
+// bound at buffer(0). Field-for-field the same as the struct of the same name
+// in Elementwise.metal.
+//   fast           - when equal to 1 the kernels read X and Y at the same
+//                    texel and skip all index remapping.
+//   axis, ylen     - on the slow path, ylen consecutive entries starting at
+//                    index `axis` of the remapped X coordinate are copied
+//                    into the last ylen entries of the Y coordinate.
+//   xdim, ydim     - only element [3] is read by the kernels; it is handed
+//                    to xyzn2abcd / abcd2xyzn.
+//   xtrans, ytrans - copied into thread-local arrays and handed to
+//                    invtrans / trans.
+// NOTE(review): presumably xdim/ydim are tensor dimensions and xtrans/ytrans
+// axis permutations - confirm against Common.metal and the host code.
+struct ElementwiseAddParam {
+  int32_t fast;
+  int32_t axis;
+  int32_t ylen;
+  int32_t xdim[4];
+  int32_t xtrans[4];
+  int32_t ydim[4];
+  int32_t ytrans[4];
+};
+// Instantiate the template in ElementwiseAddPreluKernel.inc.metal six times:
+// three PReLU modes for P = float, then the same three for P = half.
+// For each instantiation exactly one PRELU_CHANNEL / PRELU_ELEMENT /
+// PRELU_OTHER macro is defined; the template only tests these with #ifdef, so
+// the replacement text given to them here is not significant. PRELU_TYPE is
+// the token that ends up in the kernel name.
+// NOTE(review): assuming FUNC3_ from Macro.metal joins its arguments with
+// underscores, the generated names are elementwise_add_channel_<P>,
+// elementwise_add_prelu_element_<P> and elementwise_add_prelu_other_<P>. The
+// channel variant lacks the "prelu_" prefix the other two carry - confirm the
+// host looks the kernels up under exactly these names before renaming.
+// ---- P = float ----
+#define P float
+// per-channel slope
+#define PRELU_CHANNEL prelu_channel
+#define PRELU_TYPE channel
+#include "ElementwiseAddPreluKernel.inc.metal"
+#undef  PRELU_TYPE
+#undef  PRELU_CHANNEL
+// per-element slope
+#define PRELU_ELEMENT element
+#define PRELU_TYPE prelu_element
+#include "ElementwiseAddPreluKernel.inc.metal"
+#undef  PRELU_TYPE
+#undef  PRELU_ELEMENT
+// single scalar slope
+#define PRELU_OTHER   other
+#define PRELU_TYPE prelu_other
+#include "ElementwiseAddPreluKernel.inc.metal"
+#undef  PRELU_TYPE
+#undef  PRELU_OTHER
+#undef P
+// ---- P = half ----
+#define P half
+// per-channel slope
+#define PRELU_CHANNEL channel
+#define PRELU_TYPE channel
+#include "ElementwiseAddPreluKernel.inc.metal"
+#undef  PRELU_TYPE
+#undef  PRELU_CHANNEL
+// per-element slope
+#define PRELU_ELEMENT element
+#define PRELU_TYPE prelu_element
+#include "ElementwiseAddPreluKernel.inc.metal"
+#undef  PRELU_TYPE
+#undef  PRELU_ELEMENT
+// single scalar slope
+#define PRELU_OTHER   other
+#define PRELU_TYPE prelu_other
+#include "ElementwiseAddPreluKernel.inc.metal"
+#undef  PRELU_TYPE
+#undef  PRELU_OTHER
+#undef P
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/FetchKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/FetchKernel.metal
@@ -12,34 +12,40 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <metal_stdlib>
+#ifdef P
-using namespace metal;
-kernel void fetch(texture2d_array<float, access::read> inTexture [[texture(0)]],
+#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
-                       device float *output [[buffer(0)]],
+#define CONCAT2_(a, b) a ## _ ## b
-                      uint3 gid [[thread_position_in_grid]]) {
+#define CONCAT2(a, b) a ## b
+#define FUNC(m, n, q) CONCAT3_(m, n, q)
+#define FUNC_T(m, n) CONCAT2_(m, n)
+#define VECTOR(p, n) CONCAT2(p, n)
+kernel void FUNC_T(fetch, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
+                  device float *output [[buffer(0)]],
+                  uint3 gid [[thread_position_in_grid]]) {
  if (gid.x >= inTexture.get_width() ||
      gid.y >= inTexture.get_height() ||
      gid.z >= inTexture.get_array_size()) {
    return;
  }
  int input_width = inTexture.get_width();
  int input_height = inTexture.get_height();
-  const float4 input = inTexture.read(gid.xy, gid.z);
+  const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z);
  int output_to = 4 * input_width * input_height;
  output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
  output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y;
  output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
  output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
 }
+kernel void FUNC(fetch, 1or2, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
-kernel void fetch_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                             device float4 *output [[buffer(0)]],
-                  device float * output [[buffer(0)]],
+                             uint3 gid [[thread_position_in_grid]]) {
-                  uint3 gid [[thread_position_in_grid]]) {
  if (gid.x >= inTexture.get_width() ||
      gid.y >= inTexture.get_height() ||
      gid.z >= inTexture.get_array_size()) {
@@ -47,25 +53,9 @@ kernel void fetch_half(texture2d_array<half, access::read> inTexture [[texture(0
  }
  int input_width = inTexture.get_width();
-  int input_height = inTexture.get_height();
+  const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z);
-  const half4 input = inTexture.read(gid.xy, gid.z);
+  output[gid.y * input_width + gid.x] = float4(input);
-  int output_to = 4 * input_width * input_height;
-  output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
-  output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y;
-  output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
-  output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
-}
-kernel void fetch_placeholder(texture2d_array<float, access::read> inTexture [[texture(0)]],
-                              device float *output [[buffer(0)]],
-                              uint3 gid [[thread_position_in_grid]]) {
-}
-kernel void fetch_placeholder_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
-                              device float *output [[buffer(0)]],
-                              uint3 gid [[thread_position_in_grid]]) {
 }
+#endif
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/FetchKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/FetchKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+using namespace metal;
+// Include the fetch kernel template twice, once per texture element type.
+// P is defined only for the duration of each include and is undefined again
+// afterwards so it cannot leak into the code that follows.
+#define P float
+#include "FetchKernel.inc.metal"
+#undef P
+#define P half
+#include "FetchKernel.inc.metal"
+#undef P
+// Kernel with an empty body for float textures: it takes the same texture and
+// buffer bindings as a fetch kernel but reads and writes nothing.
+// NOTE(review): presumably used where a pipeline function must exist but no
+// GPU-side copy is wanted - confirm against the host code.
+kernel void fetch_placeholder(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                              device float *output [[buffer(0)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+}
+// Kernel with an empty body for half textures: it takes the same texture and
+// buffer bindings as a fetch kernel but reads and writes nothing.
+// NOTE(review): presumably used where a pipeline function must exist but no
+// GPU-side copy is wanted - confirm against the host code.
+kernel void fetch_placeholder_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                                   device float *output [[buffer(0)]],
+                                   uint3 gid [[thread_position_in_grid]]) {
+}
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Kernels.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Kernels.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+// Placeholder kernel: it declares an input and an output texture binding but
+// its body is empty, so it does nothing.
+kernel void place_holder(texture2d<half, access::read> inTexture [[texture(0)]],
+                         texture2d_array<half, access::write> outTexture [[texture(1)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+}
+// Parameter block bound at buffer(0) of the resize kernel.
+// The kernel reads only strideX / strideY, which it multiplies with the
+// output coordinate to obtain the input coordinate; width and height are not
+// read by any kernel in this file.
+// NOTE(review): width/height presumably describe the output size for the
+// host side - confirm.
+struct OutputDim {
+  ushort width;
+  ushort height;
+  ushort strideX;
+  ushort strideY;
+};
+// Strided copy from a 2D half texture into a 2D half texture array.
+//
+// Each thread writes one output texel: the input texel at
+// (gid.x * strideX, gid.y * strideY) is read and stored unchanged at gid.xy
+// in slice gid.z, so every output slice receives the same data. No filtering
+// or interpolation is performed. params.width and params.height are not read.
+//
+// NOTE(review): the source coordinate is not checked against the input
+// texture's size; presumably the caller chooses strides that keep it in
+// range - confirm.
+kernel void resize(texture2d<half, access::read> inTexture [[texture(0)]],
+                   texture2d_array<half, access::write> outTexture [[texture(1)]],
+                   constant OutputDim &params [[buffer(0)]],
+                   uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+  // Source texel for this output position.
+  const uint2 pos = gid.xy * uint2(params.strideX, params.strideY);
+  const half4 input = inTexture.read(pos);
+  outTexture.write(input, gid.xy, gid.z);
+}
+// Copies a 2D float texture into slice 0 of a 2D texture array.
+// Threads whose x/y lie outside the input texture do nothing. gid.z is not
+// consulted, so every z layer of the dispatch performs the same write.
+kernel void texture2d_to_2d_array(texture2d<float, access::read> inTexture [[texture(0)]],
+                                  texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                  uint3 gid [[thread_position_in_grid]]) {
+  const bool inside = gid.x < inTexture.get_width() &&
+                      gid.y < inTexture.get_height();
+  if (inside) {
+    const float4 texel = inTexture.read(gid.xy);
+    outTexture.write(texel, gid.xy, 0);
+  }
+}
+// Copies a 2D half texture into slice 0 of a 2D texture array.
+// Threads whose x/y lie outside the input texture do nothing. gid.z is not
+// consulted, so every z layer of the dispatch performs the same write.
+kernel void texture2d_to_2d_array_half(texture2d<half, access::read> inTexture [[texture(0)]],
+                                      texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                      uint3 gid [[thread_position_in_grid]]) {
+  const bool inside = gid.x < inTexture.get_width() &&
+                      gid.y < inTexture.get_height();
+  if (inside) {
+    const half4 texel = inTexture.read(gid.xy);
+    outTexture.write(texel, gid.xy, 0);
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Macro.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Macro.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+using namespace metal;
+// Token-pasting helpers used to stamp out kernels per element type.
+// The CONCATn macros paste their arguments directly. CONCAT2 joins without a
+// separator; the CONCATn_ forms join with underscores.
+#define CONCAT2(a, b) a ## b
+#define CONCAT2_(a, b) a ## _ ## b
+#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
+#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d
+#define CONCAT5_(a, b, c, d, e) a ## _ ## b ## _ ## c ## _ ## d ## _ ## e
+// The wrappers below add one level of indirection so that arguments which
+// are themselves macros (for example P) are expanded before being pasted.
+// FUNC builds a five-part underscore-separated name.
+#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p)
+// VECTOR(p, n) yields the vector type name, e.g. VECTOR(float, 4) -> float4.
+#define VECTOR(p, n) CONCAT2(p, n)
+// FUNC2_ / FUNC3_ build two- and three-part underscore-separated names.
+#define FUNC2_(a, b) CONCAT2_(a, b)
+#define FUNC3_(a, b, c) CONCAT3_(a, b, c)
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/NMSFetchResultKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/NMSFetchResultKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+using namespace metal;
+// Copies the x component of every texel of a float texture array into a
+// linear float buffer at index gid.y * width + gid.x.
+// gid.z selects the slice that is read but does not enter the output index,
+// so threads that differ only in z write to the same element.
+kernel void nms_fetch_result(texture2d_array<float, access::read> inTexture [[texture(0)]],
+    device float *output [[buffer(0)]],
+    uint3 gid [[thread_position_in_grid]]) {
+  const bool inside = gid.x < inTexture.get_width() &&
+                      gid.y < inTexture.get_height() &&
+                      gid.z < inTexture.get_array_size();
+  if (!inside) {
+    return;
+  }
+  const int row_width = inTexture.get_width();
+  const float4 texel = inTexture.read(gid.xy, gid.z);
+  output[gid.y * row_width + gid.x] = texel.x;
+}
+// Copies the x component of every texel of a half texture array into a
+// linear float buffer at index gid.y * width + gid.x; the half value is
+// widened to float by the assignment.
+// gid.z selects the slice that is read but does not enter the output index,
+// so threads that differ only in z write to the same element.
+kernel void nms_fetch_result_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                             device float *output [[buffer(0)]],
+                             uint3 gid [[thread_position_in_grid]]) {
+  const bool inside = gid.x < inTexture.get_width() &&
+                      gid.y < inTexture.get_height() &&
+                      gid.z < inTexture.get_array_size();
+  if (!inside) {
+    return;
+  }
+  const int row_width = inTexture.get_width();
+  const half4 texel = inTexture.read(gid.xy, gid.z);
+  output[gid.y * row_width + gid.x] = texel.x;
+}
+// Copies every texel of a float texture array, all four components, into a
+// linear float4 buffer at index gid.y * width + gid.x.
+// gid.z selects the slice that is read but does not enter the output index,
+// so threads that differ only in z write to the same element.
+kernel void nms_fetch_bbox(texture2d_array<float, access::read> inTexture [[texture(0)]],
+    device float4 *output [[buffer(0)]],
+    uint3 gid [[thread_position_in_grid]]) {
+  const bool inside = gid.x < inTexture.get_width() &&
+                      gid.y < inTexture.get_height() &&
+                      gid.z < inTexture.get_array_size();
+  if (!inside) {
+    return;
+  }
+  const int row_width = inTexture.get_width();
+  const float4 texel = inTexture.read(gid.xy, gid.z);
+  output[gid.y * row_width + gid.x] = texel;
+}
+// Copies every texel of a half texture array, widened to float4, into a
+// linear float4 buffer at index gid.y * width + gid.x.
+// gid.z selects the slice that is read but does not enter the output index,
+// so threads that differ only in z write to the same element.
+kernel void nms_fetch_bbox_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                           device float4 *output [[buffer(0)]],
+                           uint3 gid [[thread_position_in_grid]]) {
+  const bool inside = gid.x < inTexture.get_width() &&
+                      gid.y < inTexture.get_height() &&
+                      gid.z < inTexture.get_array_size();
+  if (!inside) {
+    return;
+  }
+  const int row_width = inTexture.get_width();
+  const half4 texel = inTexture.read(gid.xy, gid.z);
+  output[gid.y * row_width + gid.x] = float4(texel);
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PoolKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PoolKernel.metal
@@ -12,22 +12,10 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <metal_stdlib>
+#ifdef P
-#include "Common.metal"
-using namespace metal;
-struct PoolParam {
+kernel void FUNC2_(pool, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
-  int ksizeX;
+                 texture2d_array<P, access::write> outTexture [[texture(1)]],
-  int ksizeY;
-  int strideX;
-  int strideY;
-  int paddingX;
-  int paddingY;
-  int poolType;
-};
-kernel void pool(texture2d_array<float, access::read> inTexture [[texture(0)]],
-                 texture2d_array<float, access::write> outTexture [[texture(1)]],
                 constant PoolParam &pm [[buffer(0)]],
                 uint3 gid [[thread_position_in_grid]]) {
  if (gid.x >= outTexture.get_width() ||
@@ -40,7 +28,7 @@ kernel void pool(texture2d_array<float, access::read> inTexture [[texture(0)]],
  int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height()));
  ymin = max(ymin, 0);
-  float4 r = 0;
+  VECTOR(P, 4) r = 0;
  if (pm.poolType == 0) {
    r = inTexture.read(uint2(xmin, ymin), gid.z);
    for (int x = xmin; x < xmax; x++) {
@@ -54,40 +42,9 @@ kernel void pool(texture2d_array<float, access::read> inTexture [[texture(0)]],
        r += inTexture.read(uint2(x, y), gid.z);
      }
    }
-    r /= pm.ksizeX * pm.ksizeY;
+    r /= (xmax - xmin) * (ymax - ymin);
  }
  outTexture.write(r, gid.xy, gid.z);
 }
-kernel void pool_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+#endif
-                      texture2d_array<half, access::write> outTexture [[texture(1)]],
-                      constant PoolParam &pm [[buffer(0)]],
-                      uint3 gid [[thread_position_in_grid]]) {
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) return;
-  int xmin = gid.x * pm.strideX - pm.paddingX;
-  int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
-  xmin = max(xmin, 0);
-  int ymin = gid.y * pm.strideX - pm.paddingX;
-  int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height()));
-  ymin = max(ymin, 0);
-  half4 r = 0;
-  if (pm.poolType == 0) {
-    r = inTexture.read(uint2(xmin, ymin), gid.z);
-    for (int x = xmin; x < xmax; x++) {
-      for (int y = ymin; y < ymax; y++) {
-        r = fmax(r, inTexture.read(uint2(x, y), gid.z));
-      }
-    }
-  } else if (pm.poolType == 1) {
-    for (int x = xmin; x < xmax; x++) {
-      for (int y = ymin; y < ymax; y++) {
-        r += inTexture.read(uint2(x, y), gid.z);
-      }
-    }
-    r /= pm.ksizeX * pm.ksizeY;
-  }
-  outTexture.write(r, gid.xy, gid.z);
-}
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/PoolKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/PoolKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+#include "Macro.metal"
+using namespace metal;
+// Pooling parameters, bound to the pool kernels as `constant PoolParam &pm [[buffer(0)]]`.
+// The pool kernel computes each window start as gid * stride - padding and its end as
+// start + ksize, clamped to the input texture extent.
+// NOTE(review): the pool kernel code visible in this change reads ksizeX / strideX / paddingX
+// for the y axis as well; confirm whether the *Y fields are meant to be used there.
+struct PoolParam {
+  // Window extent along x and y.
+  int ksizeX;
+  int ksizeY;
+  // Step between consecutive windows along x and y.
+  int strideX;
+  int strideY;
+  // Padding subtracted from the window start along x and y.
+  int paddingX;
+  int paddingY;
+  // Pooling mode: 0 takes the maximum over the window, 1 the average.
+  int poolType;
+};
+#define P half
+#include "PoolKernel.inc.metal"
+#undef P
+#define P float
+#include "PoolKernel.inc.metal"
+#undef P
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/PreluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/PreluKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+using namespace metal;
+// PReLU with one float4 of slopes per texture slice (alpha[gid.z]):
+// lanes that are strictly positive pass through, all other lanes are multiplied by the slope.
+kernel void prelu_channel(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                           texture2d_array<float, access::write> outTexture [[texture(1)]],
+                           const device float4 *alpha [[buffer(0)]],
+                           uint3 gid [[thread_position_in_grid]]){
+  const bool in_bounds = gid.x < outTexture.get_width() &&
+                         gid.y < outTexture.get_height() &&
+                         gid.z < outTexture.get_array_size();
+  if (!in_bounds) {
+    return;
+  }
+  constexpr sampler texel_sampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const float4 value = inTexture.sample(texel_sampler, float2(gid.x, gid.y), gid.z);
+  const float4 slope = alpha[gid.z];
+  // select(a, b, c) yields b where c is true and a elsewhere, lane by lane.
+  const float4 result = select(slope * value, value, value > 0.0f);
+  outTexture.write(result, gid.xy, gid.z);
+}
+// PReLU with an individual float4 of slopes per texel. The slope index is
+// (gid.y * width + gid.x) * array_size + gid.z, using the input texture's dimensions.
+kernel void prelu_element(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                          texture2d_array<float, access::write> outTexture [[texture(1)]],
+                          const device float4 *alpha [[buffer(0)]],
+                          uint3 gid [[thread_position_in_grid]]){
+  const bool in_bounds = gid.x < outTexture.get_width() &&
+                         gid.y < outTexture.get_height() &&
+                         gid.z < outTexture.get_array_size();
+  if (!in_bounds) {
+    return;
+  }
+  constexpr sampler texel_sampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const float4 value = inTexture.sample(texel_sampler, float2(gid.x, gid.y), gid.z);
+  // First slope entry belonging to this (x, y) position.
+  const int pixel_base = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size();
+  const float4 slope = alpha[pixel_base + gid.z];
+  // Strictly positive lanes pass through; the rest are scaled by their slope.
+  const float4 result = select(slope * value, value, value > 0.0f);
+  outTexture.write(result, gid.xy, gid.z);
+}
+// PReLU with a single scalar slope (alpha[0]) shared by every lane of every texel.
+kernel void prelu_other(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                          texture2d_array<float, access::write> outTexture [[texture(1)]],
+                          const device float *alpha [[buffer(0)]],
+                          uint3 gid [[thread_position_in_grid]]){
+  const bool in_bounds = gid.x < outTexture.get_width() &&
+                         gid.y < outTexture.get_height() &&
+                         gid.z < outTexture.get_array_size();
+  if (!in_bounds) {
+    return;
+  }
+  constexpr sampler texel_sampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const float4 value = inTexture.sample(texel_sampler, float2(gid.x, gid.y), gid.z);
+  const float slope = alpha[0];
+  // Strictly positive lanes pass through; the rest are scaled by the shared slope.
+  const float4 result = select(slope * value, value, value > 0.0f);
+  outTexture.write(result, gid.xy, gid.z);
+}
+// Half-precision PReLU with one half4 of slopes per texture slice (alpha[gid.z]).
+kernel void prelu_channel_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                          texture2d_array<half, access::write> outTexture [[texture(1)]],
+                          const device half4 *alpha [[buffer(0)]],
+                          uint3 gid [[thread_position_in_grid]]){
+  const bool in_bounds = gid.x < outTexture.get_width() &&
+                         gid.y < outTexture.get_height() &&
+                         gid.z < outTexture.get_array_size();
+  if (!in_bounds) {
+    return;
+  }
+  constexpr sampler texel_sampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const half4 value = inTexture.sample(texel_sampler, float2(gid.x, gid.y), gid.z);
+  const half4 slope = alpha[gid.z];
+  // Strictly positive lanes pass through; the rest are scaled by their slope.
+  const half4 result = select(slope * value, value, value > 0.0h);
+  outTexture.write(result, gid.xy, gid.z);
+}
+// Half-precision PReLU with an individual half4 of slopes per texel. The slope index is
+// (gid.y * width + gid.x) * array_size + gid.z, using the input texture's dimensions.
+kernel void prelu_element_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                          texture2d_array<half, access::write> outTexture [[texture(1)]],
+                          const device half4 *alpha [[buffer(0)]],
+                          uint3 gid [[thread_position_in_grid]]){
+  const bool in_bounds = gid.x < outTexture.get_width() &&
+                         gid.y < outTexture.get_height() &&
+                         gid.z < outTexture.get_array_size();
+  if (!in_bounds) {
+    return;
+  }
+  constexpr sampler texel_sampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const half4 value = inTexture.sample(texel_sampler, float2(gid.x, gid.y), gid.z);
+  // First slope entry belonging to this (x, y) position.
+  const int pixel_base = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size();
+  const half4 slope = alpha[pixel_base + gid.z];
+  // Strictly positive lanes pass through; the rest are scaled by their slope.
+  const half4 result = select(slope * value, value, value > 0.0h);
+  outTexture.write(result, gid.xy, gid.z);
+}
+// Half-precision PReLU with a single scalar slope (alpha[0]) shared by every lane.
+kernel void prelu_other_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                        texture2d_array<half, access::write> outTexture [[texture(1)]],
+                        const device half *alpha [[buffer(0)]],
+                        uint3 gid [[thread_position_in_grid]]){
+  const bool in_bounds = gid.x < outTexture.get_width() &&
+                         gid.y < outTexture.get_height() &&
+                         gid.z < outTexture.get_array_size();
+  if (!in_bounds) {
+    return;
+  }
+  constexpr sampler texel_sampler(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const half4 value = inTexture.sample(texel_sampler, float2(gid.x, gid.y), gid.z);
+  const half slope = alpha[0];
+  // Strictly positive lanes pass through; the rest are scaled by the shared slope.
+  const half4 result = select(slope * value, value, value > 0.0h);
+  outTexture.write(result, gid.xy, gid.z);
+}
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/PriorBoxKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/PriorBoxKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+using namespace metal;
+struct PriorBoxMetalParam {
+  float offset;
+  float stepWidth;
+  float stepHeight;
+  float minSize;
+  float maxSize;
+  float imageWidth;
+  float imageHeight;
+  bool clip;
+  uint numPriors;
+  uint aspecRatiosSize;
+  uint minSizeSize;
+  uint maxSizeSize;
+};
+// Writes one prior box (xmin, ymin, xmax, ymax, divided by the image size) per output texel.
+//  - Slices with gid.z < aspecRatiosSize use minSize scaled by aspect_ratios[gid.z].
+//  - The remaining slices use the square box of side sqrt(minSize * maxSize), but only when
+//    maxSizeSize > 0; otherwise nothing is written to outBoxTexture for them.
+//  - When param.clip is set, box coordinates are clamped to [0, 1] before being written.
+//  - variances[0] is copied to varianceTexture for every slice below numPriors.
+// inTexture is bound but never read by this kernel.
+kernel void prior_box(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                      texture2d_array<float, access::write> outBoxTexture [[texture(1)]],
+                      texture2d_array<float, access::write> varianceTexture [[texture(2)]],
+                      const device float *aspect_ratios [[buffer(0)]],
+                      constant PriorBoxMetalParam &param [[buffer(1)]],
+                      const device float4 *variances [[buffer(2)]],
+                      uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outBoxTexture.get_width() ||
+      gid.y >= outBoxTexture.get_height() ||
+      gid.z >= outBoxTexture.get_array_size()) return;
+  float center_x = (gid.x + param.offset) * param.stepWidth;
+  float center_y = (gid.y + param.offset) * param.stepHeight;
+  float box_width, box_height;
+  if (gid.z < param.aspecRatiosSize) {
+    float ar = aspect_ratios[gid.z];
+    box_width = param.minSize * sqrt(ar) / 2;
+    box_height = param.minSize / sqrt(ar) / 2;
+    float4 box;
+    box.x = (center_x - box_width) / param.imageWidth;
+    box.y = (center_y - box_height) / param.imageHeight;
+    box.z = (center_x + box_width) / param.imageWidth;
+    box.w = (center_y + box_height) / param.imageHeight;
+    float4 res;
+    if (param.clip) {
+      res = fmin(fmax(box, 0.0), 1.0);
+    } else {
+      res = box;
+    }
+    outBoxTexture.write(res, gid.xy, gid.z);
+  } else if (param.maxSizeSize > 0) {
+    box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
+    float4 max_box;
+    max_box.x = (center_x - box_width) / param.imageWidth;
+    max_box.y = (center_y - box_height) / param.imageHeight;
+    max_box.z = (center_x + box_width) / param.imageWidth;
+    max_box.w = (center_y + box_height) / param.imageHeight;
+    float4 res;
+    if (param.clip) {
+      res = min(max(max_box, 0.0), 1.0);
+    } else {
+      res = max_box;
+    }
+    // Write the (possibly clipped) result; writing max_box here would ignore param.clip.
+    outBoxTexture.write(res, gid.xy, gid.z);
+  }
+  const float4 variance = variances[0];
+  if (gid.z < param.numPriors) {
+    varianceTexture.write(variance, gid.xy, gid.z);
+  }
+}
+// Half-precision output variant of prior_box: box maths is done in float and converted to
+// half4 on write.
+//  - Slices with gid.z < aspecRatiosSize use minSize scaled by aspect_ratios[gid.z].
+//  - The remaining slices use the square box of side sqrt(minSize * maxSize), but only when
+//    maxSizeSize > 0; otherwise nothing is written to outBoxTexture for them.
+//  - When param.clip is set, box coordinates are clamped to [0, 1] before being written.
+//  - variances[0] is copied to varianceTexture for every slice below numPriors.
+// inTexture is bound but never read by this kernel.
+kernel void prior_box_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                      texture2d_array<half, access::write> outBoxTexture [[texture(1)]],
+                      texture2d_array<half, access::write> varianceTexture [[texture(2)]],
+                      const device half *aspect_ratios [[buffer(0)]],
+                      constant PriorBoxMetalParam &param [[buffer(1)]],
+                      const device float4 *variances [[buffer(2)]],
+                      uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outBoxTexture.get_width() ||
+      gid.y >= outBoxTexture.get_height() ||
+      gid.z >= outBoxTexture.get_array_size()) return;
+  float center_x = (gid.x + param.offset) * param.stepWidth;
+  float center_y = (gid.y + param.offset) * param.stepHeight;
+  float box_width, box_height;
+  if (gid.z < param.aspecRatiosSize) {
+    half ar = aspect_ratios[gid.z];
+    box_width = param.minSize * sqrt(ar) / 2;
+    box_height = param.minSize / sqrt(ar) / 2;
+    float4 box;
+    box.x = (center_x - box_width) / param.imageWidth;
+    box.y = (center_y - box_height) / param.imageHeight;
+    box.z = (center_x + box_width) / param.imageWidth;
+    box.w = (center_y + box_height) / param.imageHeight;
+    float4 res;
+    if (param.clip) {
+      res = fmin(fmax(box, 0.0), 1.0);
+    } else {
+      res = box;
+    }
+    outBoxTexture.write(half4(res), gid.xy, gid.z);
+  } else if (param.maxSizeSize > 0) {
+    box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
+    float4 max_box;
+    max_box.x = (center_x - box_width) / param.imageWidth;
+    max_box.y = (center_y - box_height) / param.imageHeight;
+    max_box.z = (center_x + box_width) / param.imageWidth;
+    max_box.w = (center_y + box_height) / param.imageHeight;
+    float4 res;
+    if (param.clip) {
+      res = min(max(max_box, 0.0), 1.0);
+    } else {
+      res = max_box;
+    }
+    // Write the (possibly clipped) result; writing max_box here would ignore param.clip.
+    outBoxTexture.write(half4(res), gid.xy, gid.z);
+  }
+  const float4 variance = variances[0];
+  if (gid.z < param.numPriors) {
+    varianceTexture.write(half4(variance), gid.xy, gid.z);
+  }
+}
+// Prior boxes laid out in min / max / aspect-ratio order (float output).
+//  - Slice 0: square box of side minSize.
+//  - Slice 1, when maxSizeSize > 0: square box of side sqrt(minSize * maxSize).
+//  - Later slices: aspect-ratio boxes. The slice is mapped to a ratio index, and that index
+//    is advanced by the number of entries within 1e-6 of 1 found at or before it.
+// Boxes are (xmin, ymin, xmax, ymax) divided by the image size and clamped to [0, 1] when
+// param.clip is set. variances[0] is copied to varianceTexture for slices below numPriors.
+// inTexture is bound but never read by this kernel.
+kernel void prior_box_MinMaxAspectRatiosOrder(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                      texture2d_array<float, access::write> outBoxTexture [[texture(1)]],
+                      texture2d_array<float, access::write> varianceTexture [[texture(2)]],
+                      const device float *aspect_ratios [[buffer(0)]],
+                      constant PriorBoxMetalParam &param [[buffer(1)]],
+                      const device float4 *variances [[buffer(2)]],
+                      uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outBoxTexture.get_width() ||
+      gid.y >= outBoxTexture.get_height() ||
+      gid.z >= outBoxTexture.get_array_size()) return;
+  const float center_x = (gid.x + param.offset) * param.stepWidth;
+  const float center_y = (gid.y + param.offset) * param.stepHeight;
+  // Divisor for (xmin, ymin, xmax, ymax).
+  const float4 image_extent = float4(param.imageWidth, param.imageHeight,
+                                     param.imageWidth, param.imageHeight);
+  if (gid.z == 0) {
+    const float half_side = param.minSize / 2;
+    const float4 box = float4(center_x - half_side, center_y - half_side,
+                              center_x + half_side, center_y + half_side) / image_extent;
+    outBoxTexture.write(param.clip ? fmin(fmax(box, 0.0), 1.0) : box, gid.xy, gid.z);
+  } else if (gid.z == 1 && param.maxSizeSize > 0) {
+    const float half_side = sqrt(param.minSize * param.maxSize) / 2;
+    const float4 max_box = float4(center_x - half_side, center_y - half_side,
+                                  center_x + half_side, center_y + half_side) / image_extent;
+    outBoxTexture.write(param.clip ? min(max(max_box, 0.0), 1.0) : max_box, gid.xy, gid.z);
+  } else {
+    // Slices after the min (and optional max) box map onto the aspect-ratio list.
+    int ratio_index = gid.z - (param.maxSizeSize > 0 ? 2 : 1);
+    if (ratio_index >= 0 && ratio_index < int(param.aspecRatiosSize)) {
+      int ones_seen = 0;
+      for (int i = 0; i <= ratio_index; ++i) {
+        if (fabs(aspect_ratios[i] - 1.) < 1e-6) {
+          ones_seen += 1;
+        }
+      }
+      ratio_index += ones_seen;
+      const float ar = aspect_ratios[ratio_index];
+      const float half_w = param.minSize * sqrt(ar) / 2;
+      const float half_h = param.minSize / sqrt(ar) / 2;
+      const float4 box = float4(center_x - half_w, center_y - half_h,
+                                center_x + half_w, center_y + half_h) / image_extent;
+      outBoxTexture.write(param.clip ? fmin(fmax(box, 0.0), 1.0) : box, gid.xy, gid.z);
+    }
+  }
+  if (gid.z < param.numPriors) {
+    varianceTexture.write(variances[0], gid.xy, gid.z);
+  }
+}
+// Half-precision output variant of prior_box_MinMaxAspectRatiosOrder.
+//  - Slice 0: square box of side minSize.
+//  - Slice 1, when maxSizeSize > 0: square box of side sqrt(minSize * maxSize).
+//  - Later slices: aspect-ratio boxes. The slice is mapped to a ratio index, and that index
+//    is advanced by the number of entries within 1e-6 of 1 found at or before it, exactly as
+//    the float kernel does, so both precisions place the same box in the same slice.
+// Box maths is done in float and converted to half4 on write. Boxes are clamped to [0, 1]
+// when param.clip is set. variances[0] is copied to varianceTexture for slices below
+// numPriors. inTexture is bound but never read by this kernel.
+kernel void prior_box_MinMaxAspectRatiosOrder_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                           texture2d_array<half, access::write> outBoxTexture [[texture(1)]],
+                           texture2d_array<half, access::write> varianceTexture [[texture(2)]],
+                           const device half *aspect_ratios [[buffer(0)]],
+                           constant PriorBoxMetalParam &param [[buffer(1)]],
+                           const device float4 *variances [[buffer(2)]],
+                           uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outBoxTexture.get_width() ||
+      gid.y >= outBoxTexture.get_height() ||
+      gid.z >= outBoxTexture.get_array_size()) return;
+  float center_x = (gid.x + param.offset) * param.stepWidth;
+  float center_y = (gid.y + param.offset) * param.stepHeight;
+  float box_width, box_height;
+  if (gid.z == 0) {
+    box_width = box_height = param.minSize / 2;
+    float4 box;
+    box.x = (center_x - box_width) / param.imageWidth;
+    box.y = (center_y - box_height) / param.imageHeight;
+    box.z = (center_x + box_width) / param.imageWidth;
+    box.w = (center_y + box_height) / param.imageHeight;
+    float4 res;
+    if (param.clip) {
+      res = fmin(fmax(box, 0.0), 1.0);
+    } else {
+      res = box;
+    }
+    outBoxTexture.write(half4(res), gid.xy, gid.z);
+  }
+  if (gid.z == 1 && param.maxSizeSize > 0) {
+    box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
+    float4 max_box;
+    max_box.x = (center_x - box_width) / param.imageWidth;
+    max_box.y = (center_y - box_height) / param.imageHeight;
+    max_box.z = (center_x + box_width) / param.imageWidth;
+    max_box.w = (center_y + box_height) / param.imageHeight;
+    float4 res;
+    if (param.clip) {
+      res = min(max(max_box, 0.0), 1.0);
+    } else {
+      res = max_box;
+    }
+    outBoxTexture.write(half4(res), gid.xy, gid.z);
+  }
+  int aspect_to = 0;
+  if (param.maxSizeSize > 0) {
+    aspect_to = gid.z - 2;
+  } else {
+    aspect_to = gid.z - 1;
+  }
+  // Index 0 is a valid slot: the first aspect-ratio slice maps to it. Entries equal to 1
+  // are stepped over by the skip count instead of leaving their slice unwritten.
+  if (aspect_to >= 0 && aspect_to < int(param.aspecRatiosSize)) {
+    int skip = 0;
+    for (int i = 0; i < aspect_to + 1; ++i) {
+      if (fabs(aspect_ratios[i] - 1.) < 1e-6) {
+        skip += 1;
+      }
+    }
+    aspect_to += skip;
+    float ar = aspect_ratios[aspect_to];
+    box_width = param.minSize * sqrt(ar) / 2;
+    box_height = param.minSize / sqrt(ar) / 2;
+    float4 box;
+    box.x = (center_x - box_width) / param.imageWidth;
+    box.y = (center_y - box_height) / param.imageHeight;
+    box.z = (center_x + box_width) / param.imageWidth;
+    box.w = (center_y + box_height) / param.imageHeight;
+    float4 res;
+    if (param.clip) {
+      res = fmin(fmax(box, 0.0), 1.0);
+    } else {
+      res = box;
+    }
+    outBoxTexture.write(half4(res), gid.xy, gid.z);
+  }
+  const float4 variance = variances[0];
+  if (gid.z < param.numPriors) {
+    varianceTexture.write(half4(variance), gid.xy, gid.z);
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ReluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ReluKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+using namespace metal;
+// Half-precision ReLU: each lane of the input texel is clamped below at zero.
+// The maximum is taken in float and converted back to half4 on write.
+kernel void relu_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                 texture2d_array<half, access::write> outTexture [[texture(1)]],
+                 uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+  // The texel is fetched with read(), so no sampler object is required.
+  const half4 input = inTexture.read(gid.xy, gid.z);
+  const float4 relu = fmax((float4)input, 0.0);
+  outTexture.write(half4(relu), gid.xy, gid.z);
+}
+// Float ReLU: each lane of the input texel is clamped below at zero.
+kernel void relu(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                      texture2d_array<float, access::write> outTexture [[texture(1)]],
+                      uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+  // The texel is fetched with read(), so no sampler object is required.
+  const float4 input = inTexture.read(gid.xy, gid.z);
+  const float4 relu = fmax(input, 0.0);
+  outTexture.write(relu, gid.xy, gid.z);
+}
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ReshapeKernel.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ReshapeKernel.inc.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+// Kernel template: the including file must define P (texel scalar type), RIN and ROUT
+// before including this file. Nothing is emitted when P is undefined.
+#ifdef P
+// Token-pasting helpers. They are redefined with identical bodies on every inclusion.
+#define CONCAT2(a, b) a ## b
+#define CONCAT2_(a, b) a ## _ ## b
+#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
+#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d
+// FUNC(reshape, RIN, ROUT, P) expands to the kernel name reshape_<RIN>_<ROUT>_<P>.
+#define FUNC(f, r1, r2, p) CONCAT4_(f, r1, r2, p)
+// VECTOR(P, 4) expands to the 4-wide vector type of P, e.g. float4 or half4.
+#define VECTOR(p, n) CONCAT2(p, n)
+// FUNC_R(name, r) expands to name_<r>, used to pick a per-RIN / per-ROUT helper.
+#define FUNC_R(f, r) CONCAT2_(f, r)
+// For each of the four components of an output texel, maps the output texture coordinate
+// back to an input texture coordinate and copies that single component.
+// The coordinate helpers (xyzn2abcd_*, abcd2xyzn_*, invtrans, trans, abcd2index,
+// index2abcd) are defined outside this file.
+kernel void FUNC(reshape, RIN, ROUT, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
+                    texture2d_array<P, access::write> outTexture [[texture(1)]],
+                    constant ReshapeParam &rp [[buffer(0)]],
+                    uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+  // oxyzn / ixyzn: texture coordinates (x, y, slice, component) on the output / input side.
+  // oabcd / iabcd: the matching 4-element index arrays produced by the helpers.
+  int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, oabcd[4], ixyzn[4], iabcd[4];
+  // Local copy of the parameters read from constant memory.
+  ReshapeParam lrp = rp;
+  // oC / iC are only consumed by the *_4 helpers (ROUT == 4 / RIN == 4 branches below).
+  int oC = lrp.odim[lrp.otrans[3]];
+  int iC = lrp.idim[lrp.itrans[3]];
+  // Product of the four output dimensions; indices at or beyond it are written as 0.
+  int count = lrp.odim[0] * lrp.odim[1] * lrp.odim[2] * lrp.odim[3];
+  VECTOR(P, 4) r;
+  for (int n = 0; n < 4; n++) {
+    oxyzn[3] = n;
+#if ROUT == 4
+    xyzn2abcd_4(oC, oxyzn, oabcd);
+#else
+    FUNC_R(xyzn2abcd, ROUT)(oxyzn, oabcd);
+#endif
+    int tabcd[4];
+    invtrans(lrp.otrans, oabcd, tabcd);
+    // Index computed against the output dims, then re-expanded against the input dims below.
+    int index = abcd2index(lrp.odim, tabcd);
+    if (index < count) {
+      index2abcd(lrp.idim, index, tabcd);
+      trans(lrp.itrans, tabcd, iabcd);
+#if RIN == 4
+      abcd2xyzn_4(iC, iabcd, ixyzn);
+#else
+      FUNC_R(abcd2xyzn, RIN)(iabcd, ixyzn);
+#endif
+      // Read the whole input texel and keep only the component selected by ixyzn[3].
+      r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]];
+    } else {
+      r[n] = 0;
+    }
+  }
+  outTexture.write(r, gid.xy, gid.z);
+}
+#endif
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ReshapeKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ReshapeKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+// Parameters for the reshape kernels, bound as `constant ReshapeParam &rp [[buffer(0)]]`.
+// The kernels pass idim / odim to the index helpers as the input / output dimension arrays
+// and itrans / otrans to trans() / invtrans().
+// NOTE(review): itrans / otrans are presumably axis permutations — confirm against the host code.
+struct ReshapeParam {
+  int32_t idim[4];    // input dimensions
+  int32_t itrans[4];  // input-side transform indices
+  int32_t odim[4];    // output dimensions
+  int32_t otrans[4];  // output-side transform indices
+};
+// Kernel instantiation list. Every inclusion of ReshapeKernel.inc.metal emits one kernel named
+// reshape_<RIN>_<ROUT>_<P> for the values of P, RIN and ROUT defined at that point.
+// The list below covers RIN and ROUT in 1..4 for P = float, then again for P = half.
+#define P float
+#define RIN 4
+#define ROUT 4
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 3
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 2
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 1
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#undef RIN
+#define RIN 3
+#define ROUT 4
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 3
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 2
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 1
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#undef RIN
+#define RIN 2
+#define ROUT 4
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 3
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 2
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 1
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#undef RIN
+#define RIN 1
+#define ROUT 4
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 3
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 2
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 1
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#undef RIN
+#undef P
+#define P half
+#define RIN 4
+#define ROUT 4
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 3
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 2
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 1
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#undef RIN
+#define RIN 3
+#define ROUT 4
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 3
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 2
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 1
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#undef RIN
+#define RIN 2
+#define ROUT 4
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 3
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 2
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 1
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#undef RIN
+#define RIN 1
+#define ROUT 4
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 3
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 2
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 1
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#undef RIN
+#undef P
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ResizeBilinear.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ResizeBilinear.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+using namespace metal;
+// Parameters for the resize_bilinear kernels, bound at buffer(0).
+// The kernels multiply the output texel coordinate by these ratios to obtain the
+// (fractional) source coordinate: h = gid.y * ratio_h, w = gid.x * ratio_w.
+struct resize_bilinear_param {
+//  int32_t out_h;
+//  int32_t out_w;
+  float ratio_h;  // source rows per output row
+  float ratio_w;  // source columns per output column
+};
+// Bilinear resize of a texture array (float).
+// When input and output have the same width and height the texel is copied unchanged.
+// Otherwise the source coordinate is gid * ratio, and the four neighbouring texels are
+// blended with weights taken from its fractional part. Neighbours that would fall past the
+// right or bottom edge are replaced by the edge texel.
+kernel void resize_bilinear(texture2d_array<float, access::read> input [[texture(0)]],
+                     texture2d_array<float, access::write> output [[texture(2)]],
+                     constant resize_bilinear_param & pm [[buffer(0)]],
+                     uint3 gid [[thread_position_in_grid]]) {
+  // Threads dispatched beyond the output extent must neither read nor write.
+  if (gid.x >= output.get_width() ||
+      gid.y >= output.get_height() ||
+      gid.z >= output.get_array_size()) return;
+  float4 r;
+  if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
+    r = input.read(gid.xy, gid.z);
+  } else {
+    float w = gid.x * pm.ratio_w;
+    float h = gid.y * pm.ratio_h;
+    // Integer part selects the top-left neighbour, fractional part gives the blend weights.
+    uint w0 = w, h0 = h;
+    uint w1 = w0 + 1, h1 = h0 + 1;
+    float w1lambda = w - w0, h1lambda = h - h0;
+    float w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
+    if (w1 >= input.get_width()) w1 = w0;
+    if (h1 >= input.get_height()) h1 = h0;
+    float4 r0 = input.read(uint2(w0, h0), gid.z);
+    float4 r1 = input.read(uint2(w1, h0), gid.z);
+    float4 r2 = input.read(uint2(w0, h1), gid.z);
+    float4 r3 = input.read(uint2(w1, h1), gid.z);
+    r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);
+  }
+  output.write(r, gid.xy, gid.z);
+}
+// Bilinear resize of a texture array (half).
+// When input and output have the same width and height the texel is copied unchanged.
+// Otherwise the source coordinate is gid * ratio, and the four neighbouring texels are
+// blended with weights taken from its fractional part. Neighbours that would fall past the
+// right or bottom edge are replaced by the edge texel.
+kernel void resize_bilinear_half(texture2d_array<half, access::read> input [[texture(0)]],
+                            texture2d_array<half, access::write> output [[texture(2)]],
+                            constant resize_bilinear_param & pm [[buffer(0)]],
+                            uint3 gid [[thread_position_in_grid]]) {
+  // Threads dispatched beyond the output extent must neither read nor write.
+  if (gid.x >= output.get_width() ||
+      gid.y >= output.get_height() ||
+      gid.z >= output.get_array_size()) return;
+  half4 r;
+  if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
+    r = input.read(gid.xy, gid.z);
+  } else {
+    // Source coordinates are kept in float: half cannot represent every integer above 2048,
+    // which would select the wrong texel and distort the weights on large textures.
+    float w = gid.x * pm.ratio_w;
+    float h = gid.y * pm.ratio_h;
+    uint w0 = w, h0 = h;
+    uint w1 = w0 + 1, h1 = h0 + 1;
+    // Only the fractional weights (in [0, 1)) are narrowed to half for the blend.
+    half w1lambda = w - w0, h1lambda = h - h0;
+    half w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
+    if (w1 >= input.get_width()) w1 = w0;
+    if (h1 >= input.get_height()) h1 = h0;
+    half4 r0 = input.read(uint2(w0, h0), gid.z);
+    half4 r1 = input.read(uint2(w1, h0), gid.z);
+    half4 r2 = input.read(uint2(w0, h1), gid.z);
+    half4 r3 = input.read(uint2(w1, h1), gid.z);
+    r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);
+  }
+  output.write(r, gid.xy, gid.z);
+}
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Scale.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Scale.metal
+//
+//  Scale.metal
+//  paddle-mobile
+//
+//  Created by liuRuiLong on 2019/1/4.
+//  Copyright © 2019 orange. All rights reserved.
+//
+#include <metal_stdlib>
+using namespace metal;
+// Nearest-neighbour rescale of inTexture into outTexture; one thread per output pixel.
+// Threads outside the output extent exit without writing.
+kernel void scale(texture2d<float, access::sample> inTexture [[texture(0)]], texture2d<float, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height()) return;
+  // Divide in float: the texture extents are unsigned integers, so dividing them
+  // directly truncates the stride (0 when upscaling, a crop for non-integer ratios).
+  float w_stride = float(inTexture.get_width()) / float(outTexture.get_width());
+  float h_stride = float(inTexture.get_height()) / float(outTexture.get_height());
+  // Pixel (unnormalised) coordinates, nearest texel, zero outside the texture.
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  float4 input = inTexture.sample(sample, float2(gid.x * w_stride,    gid.y * h_stride), 0);
+  outTexture.write(input, gid);
+}
+// Nearest-neighbour rescale of a float inTexture into a half outTexture; one thread per
+// output pixel. Threads outside the output extent exit without writing.
+kernel void scale_half(texture2d<float, access::sample> inTexture [[texture(0)]], texture2d<half, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height()) return;
+  // Divide in float: the texture extents are unsigned integers, so dividing them
+  // directly truncates the stride (0 when upscaling, a crop for non-integer ratios).
+  float w_stride = float(inTexture.get_width()) / float(outTexture.get_width());
+  float h_stride = float(inTexture.get_height()) / float(outTexture.get_height());
+  // Pixel (unnormalised) coordinates, nearest texel, zero outside the texture.
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  float4 input = inTexture.sample(sample, float2(gid.x * w_stride,    gid.y * h_stride), 0);
+  // Narrow the sampled float4 to half4 for the half-precision output texture.
+  outTexture.write(half4(input), gid);
+}
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Shape.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Shape.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+using namespace metal;
+// Intentionally empty kernel: takes no arguments and performs no work.
+// NOTE(review): presumably exists only so a pipeline named "shape" can be looked up - confirm.
+kernel void shape() {
+}
+// Intentionally empty kernel: takes no arguments and performs no work.
+// NOTE(review): presumably the half-precision counterpart of "shape", kept for name lookup - confirm.
+kernel void shape_half() {
+}
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Softmax.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Softmax.inc.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#ifdef P
+#define CONCAT2(a, b) a ## b
+#define CONCAT2_(a, b) a ## _ ## b
+#define FUNC(f, p) CONCAT2_(f, p)
+#define VECTOR(p, n) CONCAT2(p, n)
+// Softmax over the first sp.K scalar values of a row, for precision P.
+// Every thread rescans its own row (row gid.y of array slice 0, four values per texel
+// along x) to find the row maximum and the sum of exp(v - max), then normalises the
+// single texel it owns at (gid.xy, gid.z).
+kernel void FUNC(softmax, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
+                             texture2d_array<P, access::write> outTexture [[texture(1)]],
+                             constant SoftmaxParam &sp [[buffer(0)]],
+                             uint3 gid [[thread_position_in_grid]]) {
+  const bool outside = gid.x >= outTexture.get_width() ||
+                       gid.y >= outTexture.get_height() ||
+                       gid.z >= outTexture.get_array_size();
+  if (outside) return;
+  // K values occupy fullTexels complete texels plus tailLanes lanes of one more texel.
+  const int fullTexels = sp.K / 4;
+  const int tailLanes = sp.K % 4;
+  // Pass 1: row maximum, seeded with the first value of the row.
+  P rowMax = inTexture.read(uint2(0, gid.y), 0)[0];
+  for (int col = 0; col < fullTexels; col++) {
+    VECTOR(P, 4) texel = inTexture.read(uint2(col, gid.y), 0);
+    rowMax = max(rowMax, max(texel[0], max(texel[1], max(texel[2], texel[3]))));
+  }
+  if (tailLanes > 0) {
+    VECTOR(P, 4) texel = inTexture.read(uint2(fullTexels, gid.y), 0);
+    for (int lane = 0; lane < tailLanes; lane++) {
+      rowMax = max(rowMax, texel[lane]);
+    }
+  }
+  // Pass 2: sum of exp(v - rowMax); complete texels are accumulated lane-wise first.
+  VECTOR(P, 4) laneSums = {0, 0, 0, 0};
+  for (int col = 0; col < fullTexels; col++) {
+    VECTOR(P, 4) texel = inTexture.read(uint2(col, gid.y), 0);
+    laneSums += exp(texel - rowMax);
+  }
+  P total = laneSums[0] + laneSums[1] + laneSums[2] + laneSums[3];
+  if (tailLanes > 0) {
+    VECTOR(P, 4) texel = inTexture.read(uint2(fullTexels, gid.y), 0);
+    for (int lane = 0; lane < tailLanes; lane++) {
+      total += exp(texel[lane] - rowMax);
+    }
+  }
+  // Normalise this thread's own texel and store it.
+  VECTOR(P, 4) own = inTexture.read(gid.xy, gid.z);
+  own = exp(own - rowMax) / total;
+  outTexture.write(own, gid.xy, gid.z);
+}
+#endif
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Softmax.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Softmax.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+using namespace metal;
+// Parameters for the softmax kernels (bound at buffer(0)).
+struct SoftmaxParam {
+  // Not read by the softmax kernel template included below.
+  int N;
+  // Number of scalar values per row that take part in the softmax; the kernel walks
+  // them four at a time (one texel) along x.
+  int K;
+};
+// Instantiate the kernel template once per precision; the template names each kernel
+// softmax_<P>, giving softmax_float and softmax_half.
+#define P float
+#include "Softmax.inc.metal"
+#undef P
+#define P half
+#include "Softmax.inc.metal"
+#undef P
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Split.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Split.inc.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#ifdef P
+#define CONCAT2(a, b) a ## b
+#define CONCAT2_(a, b) a ## _ ## b
+#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
+#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d
+#define CONCAT5_(a, b, c, d, e) a ## _ ## b ## _ ## c ## _ ## d ## _ ## e
+#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p)
+#define VECTOR(p, n) CONCAT2(p, n)
+#define FUNC_R(f, r) CONCAT2_(f, r)
+#if V == VX
+#define VV x
+#elif V == VY
+#define VV y
+#elif V == VZ
+#define VV z
+#else
+#define VV normal
+#endif
+#if V == VY
+// Split along the texture y axis into N outputs (N is a compile-time macro, 2..4).
+// Each thread reads one input texel and routes it to the output whose section contains
+// its row: sections are consumed in order, subtracting sp.vdim[i] from the running row
+// index until it falls inside the current section. A thread whose row lies past the
+// last section writes nothing.
+// NOTE(review): the first section is written at gid.xy rather than (gid.x, y), which
+// matches the later branches only when sp.offset == 0; rows with gid.y < sp.offset give
+// a negative y and also land in this branch - confirm callers always pass offset 0.
+kernel void FUNC(split, R, N, VV, P)(texture2d_array<P, access::read> input [[texture(0)]],
+                                 texture2d_array<P, access::write> out1 [[texture(1)]],
+                                 texture2d_array<P, access::write> out2 [[texture(2)]],
+#if N >= 3
+                                 texture2d_array<P, access::write> out3 [[texture(3)]],
+#endif // N >= 3
+#if N >= 4
+                                 texture2d_array<P, access::write> out4 [[texture(4)]],
+#endif // N >= 4
+                                 constant SplitParam &sp [[buffer(0)]],
+                                 uint3 gid [[thread_position_in_grid]]) {
+  VECTOR(P, 4) r = input.read(gid.xy, gid.z);
+  // Row index relative to the start of the split region.
+  int y = gid.y - sp.offset;
+  if (y < sp.vdim[0]) {
+    out1.write(r, gid.xy, gid.z);
+    return;
+  }
+  // Rebase the row index onto the second section.
+  y -= sp.vdim[0];
+  if (y < sp.vdim[1]) {
+    out2.write(r, uint2(gid.x, y), gid.z);
+    return;
+  }
+#if N >= 3
+  // Rebase onto the third section.
+  y -= sp.vdim[1];
+  if (y < sp.vdim[2]) {
+    out3.write(r, uint2(gid.x, y), gid.z);
+    return;
+  }
+#endif // N >= 3
+#if N >= 4
+  // Rebase onto the fourth section.
+  y -= sp.vdim[2];
+  if (y < sp.vdim[3]) {
+    out4.write(r, uint2(gid.x, y), gid.z);
+    return;
+  }
+#endif // N >= 4
+}
+#endif // V == VY
+#if V == VX
+// Split along the texture x axis into N outputs (N is a compile-time macro, 2..4).
+// Each thread reads one input texel and routes it to the output whose section contains
+// its column: sections are consumed in order, subtracting sp.vdim[i] from the running
+// column index until it falls inside the current section. A thread whose column lies
+// past the last section writes nothing. sp.offset is not used by this variant.
+kernel void FUNC(split, R, N, VV, P)(texture2d_array<P, access::read> input [[texture(0)]],
+                                     texture2d_array<P, access::write> out1 [[texture(1)]],
+                                     texture2d_array<P, access::write> out2 [[texture(2)]],
+#if N >= 3
+                                     texture2d_array<P, access::write> out3 [[texture(3)]],
+#endif // N >= 3
+#if N >= 4
+                                     texture2d_array<P, access::write> out4 [[texture(4)]],
+#endif // N >= 4
+                                     constant SplitParam &sp [[buffer(0)]],
+                                     uint3 gid [[thread_position_in_grid]]) {
+  VECTOR(P, 4) r = input.read(gid.xy, gid.z);
+  // Column index, rebased section by section below.
+  int x = gid.x;
+  if (x < sp.vdim[0]) {
+    out1.write(r, gid.xy, gid.z);
+    return;
+  }
+  // Rebase the column index onto the second section.
+  x -= sp.vdim[0];
+  if (x < sp.vdim[1]) {
+    out2.write(r, uint2(x, gid.y), gid.z);
+    return;
+  }
+#if N >= 3
+  // Rebase onto the third section.
+  x -= sp.vdim[1];
+  if (x < sp.vdim[2]) {
+    out3.write(r, uint2(x, gid.y), gid.z);
+    return;
+  }
+#endif // N >= 3
+#if N >= 4
+  // Rebase onto the fourth section.
+  x -= sp.vdim[2];
+  if (x < sp.vdim[3]) {
+    out4.write(r, uint2(x, gid.y), gid.z);
+    return;
+  }
+#endif // N >= 4
+}
+#endif // V == VX
+#undef VV
+#endif
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Split.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Split.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+// Parameters for the split kernels (bound at buffer(0)).
+// Only offset and vdim are read by the kernel template in Split.inc.metal; idim, axis
+// and trans are not referenced there.
+struct SplitParam {
+  int32_t idim[4];
+  int32_t axis;
+  // Subtracted from gid.y by the y-axis variant before sections are matched.
+  int32_t offset;
+  int32_t trans[4];
+  // Extent of each output section along the split axis, in output order.
+  int32_t vdim[4];
+};
+// Numeric tags for the split axis; Split.inc.metal compares V against these with #if.
+#define VNORMAL 1
+#define VX 2
+#define VY 3
+#define VZ 4
+// only support split_{2, 3, 4}_{2, 3, 4}_y_{float, half}
+// only support split_{3, 4}_{2, 3, 4}_x_{float, half}
+// Each include below stamps out one kernel named split_<R>_<N>_<axis>_<P>; only the
+// four y-axis kernels listed here are actually instantiated in this file.
+//// ssd-ar: (R=3, N=2, V=y)
+// Produces split_3_2_y_float and split_3_2_y_half.
+#define V VY
+  #define R 3
+    #define N 2
+      #define P float
+        #include "Split.inc.metal"
+      #undef P
+      #define P half
+        #include "Split.inc.metal"
+      #undef P
+    #undef N
+  #undef R
+#undef V
+//// ssd-ar: (R=2, N=2, V=y)
+// Produces split_2_2_y_float and split_2_2_y_half.
+#define V VY
+  #define R 2
+    #define N 2
+      #define P float
+        #include "Split.inc.metal"
+      #undef P
+      #define P half
+        #include "Split.inc.metal"
+      #undef P
+    #undef N
+  #undef R
+#undef V
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/TransposeKernel.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/TransposeKernel.inc.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#ifdef P
+#define CONCAT2(a, b) a ## b
+#define CONCAT2_(a, b) a ## _ ## b
+#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
+#define FUNC(f, r, p) CONCAT3_(f, r, p)
+#define VECTOR(p, n) CONCAT2(p, n)
+// Transpose kernel template for rank R (2, 3 or 4) and precision P.
+// Each thread builds one output texel: for each of its four lanes it converts the output
+// texture coordinate to an output index (oabcd), permutes that through pm.axis into an
+// input index (iabcd), converts it back to an input texture coordinate (ixyzn) and
+// fetches the matching input lane.
+// The xyzn2abcd_* / abcd2xyzn_* helpers are defined outside this file.
+kernel void FUNC(transpose, R, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
+                      texture2d_array<P, access::write> outTexture [[texture(1)]],
+                      constant TransposeParam &pm [[buffer(0)]],
+                      uint3 gid [[thread_position_in_grid]]) {
+  VECTOR(P, 4) r;
+  int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0};
+  int iabcd[4], oabcd[4], ixyzn[4];
+  for (int n = 0; n < 4; n++) {
+    // Lane n of the output texel.
+    oxyzn[3] = n;
+    // Output texture coordinate -> output index. The result must land in oabcd for
+    // every rank, because the permutation below reads oabcd.
+#if R == 4
+    xyzn2abcd_4(pm.oC, oxyzn, oabcd);
+#endif // R == 4
+#if R == 3
+    xyzn2abcd_3(oxyzn, oabcd);
+#endif // R == 3
+#if R == 2
+    xyzn2abcd_2(oxyzn, oabcd);
+#endif // R == 2
+    // Output dimension i comes from input dimension pm.axis[i].
+    iabcd[pm.axis[0]] = oabcd[0];
+    iabcd[pm.axis[1]] = oabcd[1];
+    iabcd[pm.axis[2]] = oabcd[2];
+    iabcd[pm.axis[3]] = oabcd[3];
+    // Input index -> input texture coordinate.
+#if R == 4
+    abcd2xyzn_4(pm.iC, iabcd, ixyzn);
+#endif // R == 4
+#if R == 3
+    abcd2xyzn_3(iabcd, ixyzn);
+#endif // R == 3
+#if R == 2
+    abcd2xyzn_2(iabcd, ixyzn);
+#endif // R == 2
+    r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]];
+  }
+  outTexture.write(r, gid.xy, gid.z);
+}
+#endif
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/TransposeKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/TransposeKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+// Parameters for the transpose kernels (bound at buffer(0)).
+struct TransposeParam {
+  // Passed to abcd2xyzn_4 when mapping an input index to an input texture coordinate
+  // (rank-4 variant only). NOTE(review): presumably the input channel count - confirm.
+  int iC;
+  // Passed to xyzn2abcd_4 when mapping an output texture coordinate to an output index
+  // (rank-4 variant only). NOTE(review): presumably the output channel count - confirm.
+  int oC;
+  // Permutation: output dimension i is taken from input dimension axis[i].
+  int axis[4];
+};
+// Pass-through variant of transpose for float textures: copies the texel at
+// (gid.xy, gid.z) from inTexture to the same position in outTexture. pm is unused.
+kernel void transpose_copy_float(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                                 texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                 constant TransposeParam &pm [[buffer(0)]],
+                                 uint3 gid [[thread_position_in_grid]]) {
+  const uint2 texel = gid.xy;
+  const uint slice = gid.z;
+  const float4 value = inTexture.read(texel, slice);
+  outTexture.write(value, texel, slice);
+}
+// Pass-through variant of transpose for half textures: copies the texel at
+// (gid.xy, gid.z) from inTexture to the same position in outTexture. pm is unused.
+kernel void transpose_copy_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                                texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                constant TransposeParam &pm [[buffer(0)]],
+                                uint3 gid [[thread_position_in_grid]]) {
+  const uint2 texel = gid.xy;
+  const uint slice = gid.z;
+  const half4 value = inTexture.read(texel, slice);
+  outTexture.write(value, texel, slice);
+}
+// Instantiate the transpose template for ranks 4, 3 and 2 in both precisions.
+// Each include produces one kernel named transpose_<R>_<P>, e.g. transpose_4_float.
+#define R 4
+  #define P float
+    #include "TransposeKernel.inc.metal"
+  #undef P
+  #define P half
+    #include "TransposeKernel.inc.metal"
+  #undef P
+#undef R
+#define R 3
+  #define P float
+    #include "TransposeKernel.inc.metal"
+  #undef P
+  #define P half
+    #include "TransposeKernel.inc.metal"
+  #undef P
+#undef R
+#define R 2
+  #define P float
+    #include "TransposeKernel.inc.metal"
+  #undef P
+  #define P half
+    #include "TransposeKernel.inc.metal"
+  #undef P
+#undef R
--- a/metal/paddle-mobile/paddle-mobile/Operators/MulticlassNMSOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/MulticlassNMSOp.swift
@@ -22,24 +22,27 @@ class MulticlassNMSParam<P: PrecisionType>: OpParam {
      bboxes = try MulticlassNMSParam.getFirstTensor(key: "BBoxes", map: opDesc.inputs, from: inScope)
      output = try MulticlassNMSParam.outputOut(outputs: opDesc.outputs, from: inScope)
-      middleOutput = FetchHolder.init(inCapacity: scores.tensorDim.numel(), inDim: scores.tensorDim.dims)
+      middleOutput = FetchHolder.init(inPaddedCapacity: scores.tensorDim.numel(), inDim: scores.tensorDim)
-      bboxOutput = FetchHolder.init(inCapacity: bboxes.tensorDim.numel(), inDim: bboxes.tensorDim.dims)
+      bboxOutput = FetchHolder.init(inPaddedCapacity: bboxes.tensorDim.numel(), inDim: bboxes.tensorDim)
    } catch let error {
      throw error
    }
  }
  var bboxOutput: FetchHolder
  var middleOutput: FetchHolder
-  let scores: Texture<P>
+  let scores: Texture
-  let bboxes: Texture<P>
+  let bboxes: Texture
-  var output: Texture<P>
+  var output: Texture
 }
 class MulticlassNMSOp<P: PrecisionType>: Operator<MulticlassNMSKernel<P>, MulticlassNMSParam<P>>, Runable, Creator, InferShaperable{
-  func inputVariant() -> [String : [Variant]] {
+  // Returns the result buffers of the intermediate fetch holders, keyed by input name:
+  // "Scores" from middleOutput and "BBoxes" from bboxOutput. Traps if either buffer is nil.
+  func inputVariant() -> [String : [MTLBuffer]] {
-    return ["Scores" : [para.middleOutput], "BBoxes" : [para.bboxOutput]]
+    guard let scoreBuffer = para.middleOutput.resultBuffer, let bboxBuffer = para.bboxOutput.resultBuffer else {
+      fatalError()
+    }
+    return ["Scores" : [scoreBuffer], "BBoxes" : [bboxBuffer]]
  }
  func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) {

--- a/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift
@@ -32,8 +32,8 @@ class PoolParam<P: PrecisionType>: OpParam {
    }
    //        let buffer = input.metalTexture.buffer.contents().assumingMemoryBound(to: P.self)
  }
-  let input: Texture<P>
+  let input: Texture
-  var output: Texture<P>
+  var output: Texture
  var ksize: [Int32]
  var stride: [Int32]
  var padding: [Int32]

--- a/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift
@@ -28,8 +28,8 @@ class PreluParam<P: PrecisionType>: OpParam {
  }
  let mode: String
  let alpha: Tensor<P>
-  let input: Texture<P>
+  let input: Texture
-  var output: Texture<P>
+  var output: Texture
 }
 class PreluOp<P: PrecisionType>: Operator<PreluKernel<P>, PreluParam<P>>, Runable, Creator, InferShaperable{

--- a/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
@@ -53,10 +53,10 @@ class PriorBoxParam<P: PrecisionType>: OpParam {
  var stepH: Float32
  let offset: Float32
-  let input: Texture<P>
+  let input: Texture
-  let inputImage: Texture<P>
+  let inputImage: Texture
-  var output: Texture<P>
+  var output: Texture
-  let outputVariances: Texture<P>
+  let outputVariances: Texture
 }
 class PriorBoxOp<P: PrecisionType>: Operator<PriorBoxKernel<P>, PriorBoxParam<P>>, Runable, Creator, InferShaperable{

--- a/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift
@@ -25,8 +25,8 @@ class ReluParam<P: PrecisionType>: OpParam {
      throw error
    }
  }
-  let input: Texture<P>
+  let input: Texture
-  var output: Texture<P>
+  var output: Texture
 }
 class ReluOp<P: PrecisionType>: Operator<ReluKernel<P>, ReluParam<P>>, Runable, Creator, InferShaperable{
@@ -47,10 +47,11 @@ class ReluOp<P: PrecisionType>: Operator<ReluKernel<P>, ReluParam<P>>, Runable,
  func delogOutput() {
    print(" \(type) output: ")
+    print(para.output.metalTexture)
    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
-    let device = para.output.metalTexture!.device
+//    let device = para.output.metalTexture!.device
-    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
+//    let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose)
-    print(outputArray.strideArray())
+//    print(outputArray.strideArray())
  }
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift
@@ -48,9 +48,9 @@ class ReshapeParam<P: PrecisionType>: OpParam {
      throw error
    }
  }
-  let input: Texture<P>
+  let input: Texture
  let shape: [Int32]
-  var output: Texture<P>
+  var output: Texture
 }
 class ReshapeOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>, Runable, Creator, InferShaperable{

--- a/metal/paddle-mobile/paddle-mobile/Operators/ResizeBilinearOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ResizeBilinearOp.swift
@@ -29,8 +29,8 @@ class ResizeBilinearParam<P: PrecisionType>: OpParam {
      throw error
    }
  }
-  let input: Texture<P>
+  let input: Texture
-  var output: Texture<P>
+  var output: Texture
  let out_h: Int32
  let out_w: Int32
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/ShapeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ShapeOp.swift
@@ -24,8 +24,8 @@ class ShapeParam<P: PrecisionType>: OpParam {
      throw error
    }
  }
-  var output: Texture<P>
+  var output: Texture
-  let input: Texture<P>
+  let input: Texture
 }
 class ShapeOp<P: PrecisionType>: Operator<ShapeKernel<P>, ShapeParam<P>>, Runable, Creator, InferShaperable{

--- a/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift
@@ -32,8 +32,8 @@ class SoftmaxParam<P: PrecisionType>: OpParam {
      throw error
    }
  }
-  let input: Texture<P>
+  let input: Texture
-  var output: Texture<P>
+  var output: Texture
 }
 class SoftmaxOp<P: PrecisionType>: Operator<SoftmaxKernel<P>, SoftmaxParam<P>>, Runable, Creator, InferShaperable{

--- a/metal/paddle-mobile/paddle-mobile/Operators/SplitOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/SplitOp.swift
@@ -19,7 +19,7 @@ class SplitParam<P: PrecisionType>: OpParam {
  required init(opDesc: OpDesc, inScope: Scope) throws {
    do {
      input = try SplitParam.inputX(inputs: opDesc.inputs, from: inScope)
-      output = Texture<P>.init(device: input.metalTexture!.device, inDim: input.dim)
+      output = Texture.init(device: input.metalTexture!.device, inDim: input.dim)
      axis = try SplitParam.getAttr(key: "axis", attrs: opDesc.attrs)
      sections = try SplitParam.getAttr(key: "sections", attrs: opDesc.attrs)
      if axis < 0 {
@@ -29,7 +29,7 @@ class SplitParam<P: PrecisionType>: OpParam {
        fatalError()
      }
      for out in outlist {
-        guard let variant = inScope[out], let v = variant as? Texture<P> else {
+        guard let variant = inScope[out], let v = variant as? Texture else {
          fatalError()
        }
        outputList.append(v)
@@ -41,9 +41,9 @@ class SplitParam<P: PrecisionType>: OpParam {
  }
  var axis: Int
-  let input: Texture<P>
+  let input: Texture
-  var output: Texture<P>
+  var output: Texture
-  var outputList: [Texture<P>] = []
+  var outputList: [Texture] = []
  var sections: [Int32] = []
 }

--- a/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift
@@ -26,8 +26,8 @@ class TransposeParam<P: PrecisionType>: OpParam {
      throw error
    }
  }
-  let input: Texture<P>
+  let input: Texture
-  var output: Texture<P>
+  var output: Texture
  let axis: [Int32]
 }

--- a/metal/paddle-mobile/paddle-mobile/Program/Attribute.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/Attribute.swift
--- a/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift
@@ -14,10 +14,10 @@
 import Foundation
-class BlockDesc {
+public class BlockDesc {
    let index: Int
    let parentIndex: Int
-    let vars: [VarDesc]
+    public let vars: [VarDesc]
    let ops: [OpDesc]
    init(block: PaddleMobile_Framework_Proto_BlockDesc) {
        index = Int(block.idx)
@@ -45,7 +45,7 @@ class BlockDesc {
 }
 extension BlockDesc: CustomStringConvertible, CustomDebugStringConvertible {
-    var description: String {
+  public var description: String {
        var str = ""
        for i in 0..<ops.count {
@@ -61,9 +61,7 @@ extension BlockDesc: CustomStringConvertible, CustomDebugStringConvertible {
        return str
    }
-    var debugDescription: String {
+  public var debugDescription: String {
        return description
    }
 }
--- a/metal/paddle-mobile/paddle-mobile/Program/OpDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/OpDesc.swift
--- a/metal/paddle-mobile/paddle-mobile/Program/Program.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/Program.swift
@@ -14,10 +14,10 @@
 import Foundation
-public class Program {
+@objc public class Program: NSObject {
-    let paramPath: String
+    public let paramPath: String
-    let programDesc: ProgramDesc
+    public let programDesc: ProgramDesc
-    let scope: Scope
+    public let scope: Scope
    init(inProgramDesc: ProgramDesc, inParamPath: String, inScope: Scope) {
        programDesc = inProgramDesc
        paramPath = inParamPath

--- a/metal/paddle-mobile/paddle-mobile/Program/ProgramDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/ProgramDesc.swift
@@ -15,7 +15,7 @@
 import Foundation
 public class ProgramDesc {
-    var blocks: [BlockDesc] = []
+    public var blocks: [BlockDesc] = []
    init(protoProgram: PaddleMobile_Framework_Proto_ProgramDesc) {
        for block in protoProgram.blocks {
            self.blocks.append(BlockDesc.init(block: block))

--- a/metal/paddle-mobile/paddle-mobile/Program/ProgramOptimize.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/ProgramOptimize.swift
--- a/metal/paddle-mobile/paddle-mobile/Program/Scope.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/Scope.swift
@@ -14,7 +14,7 @@
 import Foundation
-class Scope {
+public class Scope {
    let feedKey: String
    let fetchKey: String
    func setInput(input: Variant) {
@@ -29,7 +29,7 @@ class Scope {
        return vars[feedKey];
    }
-    func output() -> Variant? {
+    public func output() -> Variant? {
        return vars[fetchKey];
    }
@@ -38,7 +38,7 @@ class Scope {
        fetchKey = inFetchKey
    }
-    var vars: [String : Variant] = [:]
+    public var vars: [String : Variant] = [:]
    subscript(key: String) -> Variant?{
        get {
            return vars[key]

--- a/metal/paddle-mobile/paddle-mobile/Program/TensorDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/TensorDesc.swift
--- a/metal/paddle-mobile/paddle-mobile/Program/VarDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/VarDesc.swift
@@ -14,7 +14,7 @@
 import Foundation
-enum VarTypeType: Int {
+public enum VarTypeType: Int {
    case ErrorType = -1,
    Bool = 0,
    Int16 = 1,
@@ -56,10 +56,10 @@ enum VarTypeType: Int {
    }
 }
-class VarDesc {
+public class VarDesc {
-    let name: String
+    public let name: String
-    let persistable: Bool
+    public let persistable: Bool
-    let type: VarTypeType
+    public let type: VarTypeType
    let tensorDesc: TensorDesc?
    init(protoVarDesc: PaddleMobile_Framework_Proto_VarDesc) {
        type = VarTypeType.init(rawValue: protoVarDesc.type.type.rawValue) ?? .ErrorType
@@ -79,7 +79,7 @@ class VarDesc {
 }
 extension VarDesc: CustomStringConvertible, CustomDebugStringConvertible {
-    var description: String {
+  public var description: String {
        var str = ""
        str += "var name \(name): \n"
        if let inTensorDesc = tensorDesc {
@@ -93,7 +93,7 @@ extension VarDesc: CustomStringConvertible, CustomDebugStringConvertible {
        return str
    }
-    var debugDescription: String {
+  public var debugDescription: String {
        return description
    }
 }
--- a/metal/paddle-mobile/paddle-mobile/Program/framework.pb.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/framework.pb.swift
--- a/metal/paddle-mobile/paddle-mobile/YoloNet.swift
+++ b/metal/paddle-mobile/paddle-mobile/YoloNet.swift
-//
-//  YoloNet.swift
-//  paddle-mobile
-//
-//  Created by Xiao,Haichun on 2018/12/5.
-//  Copyright © 2018 orange. All rights reserved.
-//
-import Foundation
-import Metal
-public class YoloNet: Net {
-    @objc public override init(device: MTLDevice) {
-        super.init(device: device)
-        means = [0, 0, 0]
-        scale = 1
-        except = 0
-        modelPath = Bundle.main.path(forResource: "yolo_model", ofType: nil) ?! "model null"
-        paramPath = Bundle.main.path(forResource: "yolo_params", ofType: nil) ?! "para null"
-        modelDir = ""
-        //preprocessKernel = GenetPreProccess.init(device: device)
-        dim = (n: 1, h: 224, w: 224, c: 3)
-    }
-    @objc override public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) {
-        super.init(device:device,paramPointer:paramPointer,paramSize:paramSize,modePointer:modePointer,modelSize:modelSize)
-        means = [0, 0, 0]
-        scale = 1
-        except = 0
-        modelPath = ""
-        paramPath = ""
-        modelDir = ""
-        //preprocessKernel = GenetPreProccess.init(device: device)
-        dim = (n: 1, h: 224, w: 224, c: 3)
-    }
-//    class GenetPreProccess: CusomKernel {
-//        init(device: MTLDevice) {
-//            let s = CusomKernel.Shape.init(inWidth: 128, inHeight: 128, inChannel: 3)
-//            super.init(device: device, inFunctionName: "genet_preprocess", outputDim: s, usePaddleMobileLib: false)
-//        }
-//    }
-    override  public func resultStr(res: ResultHolder) -> String {
-        //    fatalError()
-        return " \(res.result![0]) ... "
-    }
-}
--- a/metal/paddle-mobile/paddle-mobile/paddle_mobile.h
+++ b/metal/paddle-mobile/paddle-mobile/paddle_mobile.h
@@ -14,9 +14,6 @@
 #pragma once
-#import "PaddleMobileCPU.h"
-#import "CPUCompute.h"
-#import "PaddleMobileGPU.h"
 #import <UIKit/UIKit.h>
 //! Project version number for paddle_mobile.

--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -66,6 +66,7 @@ const char *G_OP_TYPE_CONV_TRANSPOSE = "conv2d_transpose";
 const char *G_OP_TYPE_PRELU = "prelu";
 const char *G_OP_TYPE_LOOKUP_TABLE = "lookup_table";
 const char *G_OP_TYPE_GRU = "gru";
+const char *G_OP_TYPE_GRU_UNIT = "gru_unit";
 const char *G_OP_TYPE_CRF = "crf_decoding";
 const char *G_OP_TYPE_BILINEAR_INTERP = "bilinear_interp";
 const char *G_OP_TYPE_FLATTEN = "flatten";
@@ -149,6 +150,9 @@ std::unordered_map<
        {G_OP_TYPE_GRU,
         {{"Input", "H0", "Weight", "Bias"},
          {"BatchGate", "BatchResetHiddenPrev", "BatchHidden", "Hidden"}}},
+        {G_OP_TYPE_GRU_UNIT,
+         {{"Input", "HiddenPrev", "Weight", "Bias"},
+          {"Gate", "ResetHiddenPrev", "Hidden"}}},
        {G_OP_TYPE_CRF, {{"Emission", "Transition", "Label"}, {"ViterbiPath"}}},
        {G_OP_TYPE_BILINEAR_INTERP, {{"OutSize", "X"}, {"Out"}}},
        {G_OP_TYPE_FLATTEN, {{"X"}, {"Out"}}},

--- a/src/common/types.h
+++ b/src/common/types.h
@@ -110,6 +110,10 @@ enum PoolingType {
  FIRST = 3,
 };
+// Internal configuration object; the Executor constructor takes one by value.
+struct PaddleMobileConfigInternal {
+  // Off by default.
+  // NOTE(review): presumably defers loading of model data until prediction time -
+  // confirm against the executor implementation.
+  bool load_when_predict = false;
+};
 extern const char *G_OP_TYPE_CONV;
 extern const char *G_OP_TYPE_BATCHNORM;
 extern const char *G_OP_TYPE_BOX_CODER;

--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -38,12 +38,15 @@ namespace framework {
 #pragma mark - executor
 template <typename Device, typename T>
-Executor<Device, T>::Executor(const Program<Device> &program, int batch_size,
+Executor<Device, T>::Executor(const Program<Device> &program,
-                              const bool use_optimize, const bool lod_mode)
+                              paddle_mobile::PaddleMobileConfigInternal config,
+                              int batch_size, const bool use_optimize,
+                              const bool lod_mode)
    : program_(program),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
-      lod_mode_(lod_mode) {
+      lod_mode_(lod_mode),
+      config_(config) {
  DLOG << "executor in lod mode: " << lod_mode_;
  Variable *variable_ptr = program_.scope->Var("batch_size");
@@ -212,10 +215,17 @@ void Executor<Device, T>::InitCombineMemory() {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
+        DLOG << " init combine memory persistable: " << var_desc->Name();
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
+          DLOG << " init combine memory no persistable in lod: "
+               << var_desc->Name();
          varInputMemory(var_desc, var, tensor);
+        } else {
+          DLOG << " init combine memory no persistable: " << var_desc->Name();
        }
      }
    }
@@ -226,6 +236,34 @@ void Executor<Device, T>::InitCombineMemory() {
  LOG(kLOG_INFO) << "init combine memory finish";
 }
+// Re-sizes and re-allocates every non-persistable LoDTensor variable of the
+// program so that its last two dims match dims [2] and [3] of |input_tensor|
+// (the first two dims of each variable are kept). Afterwards the "fetch"
+// output is resized to the full dims of |input_tensor| and allocated.
+// Persistable variables are left untouched.
+template <typename Device, typename T>
+void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
+  for (const auto &block : program_desc_->Blocks()) {
+    for (const auto &var_desc : block->Vars()) {
+      auto *scope_var = program_.scope->Var(var_desc->Name());
+      auto *lod_tensor = scope_var->template GetMutable<LoDTensor>();
+      // Only non-persistable LoD tensors need to follow the input size.
+      if (var_desc->Persistable() ||
+          var_desc->Type() != VARTYPE_TYPE_LOD_TENSOR) {
+        continue;
+      }
+      DDim old_dim = lod_tensor->dims();
+      DDim resized_dim =
+          make_ddim({old_dim[0], old_dim[1], input_tensor.dims()[2],
+                     input_tensor.dims()[3]});
+      lod_tensor->Resize(resized_dim);
+      lod_tensor->template mutable_data<T>();
+    }
+  }
+
+  // The fetch tensor takes the same dims as the input.
+  std::shared_ptr<LoDTensor> fetch_out = GetOutput("fetch");
+  fetch_out->Resize(input_tensor.dims());
+  fetch_out->mutable_data<T>();
+}
 template <typename Device, typename T>
 bool Executor<Device, T>::varInputMemory(
    const std::shared_ptr<VarDesc> &var_desc, Variable *var,
@@ -297,7 +335,16 @@ void Executor<Device, T>::SetInput(const Tensor &input,
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
                        var_name.c_str());
  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
+  if (config_.load_when_predict) {
+    if (input_dim_last_ != input.dims()) {
+      InitNoPersistableMemory(input);
+      input_dim_last_ = input.dims();
+    }
+  }
  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
 }
@@ -309,6 +356,14 @@ void Executor<Device, T>::SetInput(const LoDTensor &input,
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
                        var_name.c_str());
  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
+  if (config_.load_when_predict) {
+    if (input_dim_last_ != input.dims()) {
+      // Size the non-persistable tensors from the incoming input itself:
+      // target_tensor is only resized further below, so its dims still
+      // describe the previous input (or nothing at all) at this point.
+      InitNoPersistableMemory(input);
+      input_dim_last_ = input.dims();
+    }
+  }
  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
  target_tensor->set_lod(input.lod());
@@ -453,6 +508,70 @@ void Executor<Device, T>::Predict_To(int end) {
 #endif
 #ifdef PADDLE_MOBILE_CL
+// OpenCL specialization: re-creates an empty CLImage for every non-persistable
+// LoDTensor variable, keeping its first two dims and taking dims [2] and [3]
+// from |input_tensor|. The "fetch" output is then resized to the dims of
+// |input_tensor| and allocated as float memory.
+template <>
+void Executor<GPU_CL, float>::InitNoPersistableMemory(
+    const Tensor &input_tensor) {
+  DLOG << "CL InitNoPersistableMemory ";
+  for (const auto &block : program_desc_->Blocks()) {
+    for (const auto &var_desc : block->Vars()) {
+      auto *scope_var = program_.scope->Var(var_desc->Name());
+      auto *image = scope_var->template GetMutable<CLImage>();
+      // Persistable variables and non-LoD variables are left as they are.
+      if (var_desc->Persistable() ||
+          var_desc->Type() != VARTYPE_TYPE_LOD_TENSOR) {
+        continue;
+      }
+      cl_context cl_ctx = program_.scope->GetCLScpoe()->Context();
+      cl_command_queue cl_queue =
+          program_.scope->GetCLScpoe()->CommandQueue();
+
+      DDim old_dim = image->dims();
+      DDim resized_dim =
+          make_ddim({old_dim[0], old_dim[1], input_tensor.dims()[2],
+                     input_tensor.dims()[3]});
+      image->Resize(resized_dim);
+      image->InitEmptyImage(cl_ctx, cl_queue, resized_dim);
+    }
+  }
+
+  std::shared_ptr<LoDTensor> fetch_out = GetOutput("fetch");
+  fetch_out->Resize(input_tensor.dims());
+  fetch_out->mutable_data<float>();
+}
+// OpenCL specialization of SetInput for plain tensors. Binds |input| to the
+// scope variable |var_name| and records the input dims in input_dim_last_.
+// With load_when_predict enabled, a change of input dims triggers a resize of
+// the target tensor and a re-initialization of the non-persistable memory;
+// without it, the target tensor is simply resized on every call.
+template <>
+void Executor<GPU_CL, float>::SetInput(const Tensor &input,
+                                       const std::string &var_name) {
+  auto *target_var = program_.scope->FindVar(var_name);
+  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
+                        var_name.c_str());
+  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
+  const DDim &in_dims = input.dims();
+
+  DLOG << "config_.load_when_predict   " << config_.load_when_predict;
+  DLOG << "target_tensor->IsInitialized() " << target_tensor->IsInitialized();
+  DLOG << "target_tensor->dims()   " << target_tensor->dims();
+  DLOG << "input.dims()   " << in_dims;
+  DLOG << "input_dim_last_   " << input_dim_last_;
+
+  if (!config_.load_when_predict) {
+    DLOG << "SetInput ---- > resize2";
+    target_tensor->Resize(in_dims);
+    DLOG << "SetInput ---- > ShareDataWith";
+  } else if (input_dim_last_ != in_dims) {
+    DLOG << "SetInput ---- > resize1";
+    target_tensor->Resize(in_dims);
+    target_tensor->mutable_data<float>();
+    // target_tensor already carries the new dims here.
+    InitNoPersistableMemory(*target_tensor);
+  }
+
+  target_tensor->ShareDataWith(input);
+  // Remember the dims so the next call can detect a size change.
+  input_dim_last_ = in_dims;
+}
 template <typename Device, typename T>
 void Executor<Device, T>::LoadMemory(const VarDesc var_desc, float *tensorInput,
                                     char **data) {}
@@ -588,6 +707,8 @@ void Executor<GPU_CL, float>::InitMemory() {
 template <>
 void Executor<GPU_CL, float>::InitCombineMemory() {
+  DLOG << "CL InitCombineMemory---- "
+       << "config_.load_when_predict: " << config_.load_when_predict;
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {

--- a/src/framework/executor.h
+++ b/src/framework/executor.h
@@ -32,7 +32,8 @@ namespace framework {
 template <typename Device, typename T = float>
 class Executor {
 public:
-  Executor(const Program<Device> &program, int batch_size = 1,
+  Executor(const Program<Device> &program,
+           paddle_mobile::PaddleMobileConfigInternal config, int batch_size = 1,
           const bool use_optimize = true, const bool lod_mode = false);
  PMStatus Predict(const std::vector<std::pair<std::string, Tensor>> &inputs);
@@ -64,6 +65,7 @@ class Executor {
                      LoDTensor *tensor) const;
  void InitMemory();
  void InitCombineMemory();
+  void InitNoPersistableMemory(const Tensor &input_tensor);
  void LoadMemory(void **data, const std::shared_ptr<VarDesc> var_desc,
                  LoDTensor *tensor);
 #ifdef PADDLE_MOBILE_CL
@@ -73,14 +75,17 @@ class Executor {
  int batch_size_;
  bool use_optimize_;
  bool lod_mode_;
+  PaddleMobileConfigInternal config_;
  Program<Device> program_;
  std::shared_ptr<ProgramDesc> program_desc_;
  typedef std::shared_ptr<OperatorBase<Device>> OperatorBasePtr;
  std::vector<std::vector<OperatorBasePtr>> ops_of_block_;
  // operators list
  std::vector<OperatorBasePtr> ops_list_;
+  // dims of the most recent input; used for super resolution
+  DDim input_dim_last_;
 #ifdef PADDLE_MOBILE_PROFILE
  struct ProfInfo {
    int tid = 0;

--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -45,8 +45,8 @@ PMStatus PaddleMobile<Device, T>::Load(const std::string &dirname,
  if (executor_.get() == nullptr) {
    executor_ = std::make_shared<framework::Executor<Device, T>>(
-        loader_->Load(dirname, optimize, quantification), batch_size, optimize,
+        loader_->Load(dirname, optimize, quantification), config_, batch_size,
-        lod_mode);
+        optimize, lod_mode);
  } else {
    LOG(kLOG_INFO) << "executor inited";
  }
@@ -67,7 +67,7 @@ PMStatus PaddleMobile<Device, T>::Load(const std::string &model_path,
  if (executor_.get() == nullptr) {
    executor_ = std::make_shared<framework::Executor<Device, T>>(
-        loader_->Load(model_path, para_path, optimize, quantification),
+        loader_->Load(model_path, para_path, optimize, quantification), config_,
        batch_size, optimize, lod_mode);
  } else {
    LOG(kLOG_INFO) << "executor inited";
@@ -106,7 +106,7 @@ bool PaddleMobile<Device, T>::LoadCombinedMemory(
        loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len,
                                    combined_params_buf, optimize,
                                    quantification),
-        batch_size, optimize, lod_mode);
+        config_, batch_size, optimize, lod_mode);
  } else {
    LOG(kLOG_INFO) << "executor inited";
  }

--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -33,6 +33,13 @@ namespace paddle_mobile {
 template <typename Device, typename T = float>
 class PaddleMobile {
 public:
+  // Constructs a PaddleMobile instance that keeps |config| and later passes it
+  // to the Executor it creates. Marked explicit so a config object is never
+  // implicitly converted into a whole PaddleMobile instance.
+  // When built without PADDLE_MOBILE_CL, instantiating with the GPU_CL device
+  // type is rejected at construction time.
+  explicit PaddleMobile(PaddleMobileConfigInternal config) : config_(config) {
+#ifndef PADDLE_MOBILE_CL
+    bool is_gpu = std::is_same<DeviceType<kGPU_CL>, Device>::value;
+    PADDLE_MOBILE_ENFORCE(!is_gpu, "Please recompile with GPU_CL is on");
+#endif
+  }
  PaddleMobile() {
 #ifndef PADDLE_MOBILE_CL
    bool is_gpu = std::is_same<DeviceType<kGPU_CL>, Device>::value;
@@ -99,6 +106,7 @@ class PaddleMobile {
 private:
  std::shared_ptr<framework::Loader<Device, T>> loader_;
  std::shared_ptr<framework::Executor<Device, T>> executor_;
+  PaddleMobileConfigInternal config_;
 };
 }  // namespace paddle_mobile
--- a/src/operators/gru_unit_op.cpp
+++ b/src/operators/gru_unit_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef GRU_UNIT_OP
+#include "operators/gru_unit_op.h"
+namespace paddle_mobile {
+namespace operators {
+// Validates the shapes of Input, HiddenPrev, Weight and (if present) Bias,
+// then sizes the outputs:
+//   Gate            -> [batch_size, frame_size * 3]
+//   ResetHiddenPrev -> [batch_size, frame_size]
+//   Hidden          -> [batch_size, frame_size]
+// where batch_size is Input's dim 0 and frame_size is HiddenPrev's dim 1.
+template <typename DeviceType, typename T>
+void GruUnitOp<DeviceType, T>::InferShape() const {
+  const auto x_dims = this->param_.InputInput()->dims();
+  const auto h_prev_dims = this->param_.InputHiddenPrev()->dims();
+  const auto w_dims = this->param_.InputWeight()->dims();
+
+  const int batch_size = x_dims[0];
+  const int frame_size = h_prev_dims[1];
+  const int gate_width = frame_size * 3;
+  const int input_size = x_dims[1];
+  const int weight_rows = w_dims[0];
+  const int weight_cols = w_dims[1];
+
+  PADDLE_MOBILE_ENFORCE(
+      (input_size == gate_width),
+      "The input_size must be 3 times of frame_size in GRUUnitOp.");
+  PADDLE_MOBILE_ENFORCE(
+      (weight_rows == frame_size),
+      "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+  PADDLE_MOBILE_ENFORCE(
+      (weight_cols == gate_width),
+      "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+
+  // Bias is optional; its shape is only checked when it is supplied.
+  const auto *bias = this->param_.InputBias();
+  if (bias) {
+    const auto b_dims = bias->dims();
+    const int bias_rows = b_dims[0];
+    const int bias_cols = b_dims[1];
+    PADDLE_MOBILE_ENFORCE((bias_rows == 1),
+                          "The shape of Bias must be [1, frame_size * 3].");
+    PADDLE_MOBILE_ENFORCE((bias_cols == gate_width),
+                          "The shape of Bias must be [1, frame_size * 3].");
+  }
+
+  this->param_.OutGate()->Resize({batch_size, gate_width});
+  this->param_.OutResetHiddenPrev()->Resize({batch_size, frame_size});
+  this->param_.OutHidden()->Resize({batch_size, frame_size});
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+// Operator registration. Only the CPU backend registers gru_unit; the guards
+// for the other backends are intentionally left empty.
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(gru_unit, ops::GruUnitOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+#ifdef PADDLE_MOBILE_CL
+#endif
+#endif
--- a/src/operators/gru_unit_op.h
+++ b/src/operators/gru_unit_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef GRU_UNIT_OP
+#pragma once
+#include "framework/operator.h"
+#include "operators/kernel/gru_unit_kernel.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+// Operator wrapper for gru_unit. It binds GruUnitParam and GruUnitKernel for
+// the given device type and provides the shape inference for the op.
+template <typename DeviceType, typename T>
+class GruUnitOp
+    : public framework::OperatorWithKernel<DeviceType,
+                                           GruUnitParam<DeviceType>,
+                                           GruUnitKernel<DeviceType, T>> {
+ public:
+  // Forwards all arguments unchanged to the OperatorWithKernel base.
+  GruUnitOp(const std::string &type, const VariableNameMap &inputs,
+            const VariableNameMap &outputs, const AttributeMap &attrs,
+            std::shared_ptr<Scope> scope)
+      : framework::OperatorWithKernel<DeviceType, GruUnitParam<DeviceType>,
+                                      GruUnitKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+
+  // Checks input shapes and resizes the Gate/ResetHiddenPrev/Hidden outputs.
+  void InferShape() const override;
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/arm/activation_kernel.cpp
+++ b/src/operators/kernel/arm/activation_kernel.cpp
@@ -37,7 +37,7 @@ struct ActivationCompute<float, Act> {
    size_t loop = remain >> 4;
    remain = remain & 0xF;
-    #pragma omp parallel for
+#pragma omp parallel for
    for (size_t i = 0; i < loop; ++i) {
      const float *local_x = x + (i << 4);
      float *local_y = y + (i << 4);

--- a/src/operators/kernel/arm/gru_unit_kernel.cpp
+++ b/src/operators/kernel/arm/gru_unit_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef GRU_UNIT_OP
+#include "operators/kernel/gru_unit_kernel.h"
+#include "operators/kernel/central-arm-func/gru_unit_arm_func.h"
+namespace paddle_mobile {
+namespace operators {
+// CPU/float specialization: performs no setup and always reports success.
+template <>
+bool GruUnitKernel<CPU, float>::Init(GruUnitParam<CPU> *param) {
+  return true;
+}
+// CPU/float specialization: forwards to GruUnitCompute from
+// central-arm-func/gru_unit_arm_func.h.
+template <>
+void GruUnitKernel<CPU, float>::Compute(const GruUnitParam<CPU> &param) {
+  GruUnitCompute<float>(param);
+}
+// Explicit instantiation of the CPU/float kernel.
+template class GruUnitKernel<CPU, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/central-arm-func/gru_unit_arm_func.h
+++ b/src/operators/kernel/central-arm-func/gru_unit_arm_func.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef GRU_UNIT_OP
+#pragma once
+#include <operators/math/gru_compute.h>
+#include "operators/kernel/activation_kernel.h"
+#include "operators/math/gemm.h"
+#include "operators/math/math_function.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+// Computes a single GRU step on CPU.
+//
+// Reads Input, HiddenPrev, Weight and the optional Bias from |param| and
+// writes Gate, ResetHiddenPrev and Hidden. The gate buffer is seeded with
+// Input + Bias (row-wise) before it is handed to math::GRUUnitFunctor, which
+// performs the rest of the computation using the activations from |param|.
+template <typename P>
+void GruUnitCompute(const GruUnitParam<CPU>& param) {
+  // inputs
+  auto* input = param.InputInput();
+  auto* hidden_prev = param.InputHiddenPrev();
+  auto* weight = param.InputWeight();
+  auto* bias = param.InputBias();
+  // outputs
+  auto* gate = param.OutGate();
+  auto* reset_hidden_prev = param.OutResetHiddenPrev();
+  auto* hidden = param.OutHidden();
+
+  // The gate pre-activations start from the projected input, so the bias is
+  // added to *input and the sum stored in gate. Reading from *gate here would
+  // use an output buffer that nothing has filled yet.
+  // NOTE(review): when bias is null, gate is not populated from input in this
+  // function - confirm whether that path needs an explicit copy.
+  if (bias) {
+    math::RowwiseAdd<CPU, float> add_bias;
+    add_bias(*input, *bias, gate);
+  }
+
+  int batch_size = input->dims()[0];
+  int frame_size = hidden_prev->dims()[1];
+  const P* weight_data = weight->data<P>();
+
+  math::GRUMetaValue<P> gru_value;
+  // gate_weight starts at the beginning of Weight; state_weight starts
+  // 2 * frame_size * frame_size elements further in.
+  gru_value.gate_weight = const_cast<P*>(weight_data);
+  gru_value.state_weight =
+      const_cast<P*>(weight_data + 2 * frame_size * frame_size);
+  gru_value.output_value = hidden->data<P>();
+  gru_value.prev_out_value = const_cast<P*>(hidden_prev->data<P>());
+  gru_value.gate_value = gate->data<P>();
+  gru_value.reset_output_value = reset_hidden_prev->data<P>();
+
+  auto active_node = math::GetActivationType(param.Activation());
+  auto active_gate = math::GetActivationType(param.GateActivation());
+  math::GRUUnitFunctor<CPU, float>::compute(gru_value, frame_size, batch_size,
+                                            active_node, active_gate);
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/gru_unit_kernel.h
+++ b/src/operators/kernel/gru_unit_kernel.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef GRU_UNIT_OP
+#pragma once
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+// Kernel interface for the gru_unit operator, parameterized by device type.
+// Device-specific behavior is supplied by specializations of Compute and Init.
+template <typename DeviceType, typename T>
+class GruUnitKernel
+    : public framework::OpKernelBase<DeviceType, GruUnitParam<DeviceType>> {
+ public:
+  // Runs the operator on the tensors referenced by |param|.
+  void Compute(const GruUnitParam<DeviceType>& param);
+  // One-time kernel setup; returns whether initialization succeeded.
+  bool Init(GruUnitParam<DeviceType>* param);
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/math/activation.h
+++ b/src/operators/math/activation.h
@@ -45,6 +45,19 @@ inline ActivationType GetActivationType(const std::string &type) {
  PADDLE_MOBILE_THROW_EXCEPTION("Not support activation type.");
 }
+// Maps an integer activation code to ActivationType:
+//   0 -> IDENTITY, 1 -> SIGMOID, 2 -> TANH, 3 -> RELU.
+// Any other code raises the "Not support activation type." exception.
+inline ActivationType GetActivationType(const int type) {
+  switch (type) {
+    case 0:
+      return ActivationType::IDENTITY;
+    case 1:
+      return ActivationType::SIGMOID;
+    case 2:
+      return ActivationType::TANH;
+    case 3:
+      return ActivationType::RELU;
+    default:
+      break;
+  }
+  PADDLE_MOBILE_THROW_EXCEPTION("Not support activation type.");
+}
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 template <ActivationType Act = IDENTITY>
 inline float32x4_t vActiveq_f32(const float32x4_t &x) {

--- a/src/operators/math/depthwise_conv3x3.cpp
+++ b/src/operators/math/depthwise_conv3x3.cpp
@@ -270,7 +270,6 @@ void DepthwiseConv3x3s1p1(const framework::Tensor *input,
  if (if_bias) {
    bias_data = bias->data<float>();
  }
  float32x4_t zero = vdupq_n_f32(0.0);
  for (int b = 0; b < batch_size; ++b) {

--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -1188,6 +1188,10 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
  }
 }
+// Empty definition of Gemm::VectorKernel for this preprocessor branch (an
+// #else branch follows): it performs no computation and leaves C untouched.
+// NOTE(review): presumably a placeholder so the symbol exists on this
+// platform - confirm that no caller relies on its output here.
+void Gemm::VectorKernel(int m, int n, int k, float alpha, const float *A,
+                        int lda, const float *B, int ldb, float beta, float *C,
+                        int ldc, bool relu) {}
 #else
 void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {

--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -74,6 +74,13 @@ class OpParam {
  static T *InputH0From(const VariableNameMap &inputs, const Scope &scope) {
    return GetVarValue<T>("H0", inputs, scope);
  }
+  // Returns the result of GetVarValue for the "HiddenPrev" key of |inputs|
+  // in |scope|.
+  template <typename T>
+  static T *InputHiddenPrevFrom(const VariableNameMap &inputs,
+                                const Scope &scope) {
+    return GetVarValue<T>("HiddenPrev", inputs, scope);
+  }
  template <typename T>
  static T *InputAlphaFrom(const VariableNameMap &inputs, const Scope &scope) {
    return GetVarValue<T>("Alpha", inputs, scope);
@@ -214,6 +221,11 @@ class OpParam {
    return GetVarValue<T>("BatchGate", outputs, scope);
  }
+  // Returns the result of GetVarValue for the "Gate" key of |outputs| in
+  // |scope|.
+  template <typename T>
+  static T *OutputGateFrom(const VariableNameMap &outputs, const Scope &scope) {
+    return GetVarValue<T>("Gate", outputs, scope);
+  }
  template <typename T>
  static T *OutputViterbiPathFrom(const VariableNameMap &outputs,
                                  const Scope &scope) {
@@ -225,6 +237,12 @@ class OpParam {
    return GetVarValue<T>("BatchResetHiddenPrev", outputs, scope);
  }
+  // Returns the result of GetVarValue for the "ResetHiddenPrev" key of
+  // |outputs| in |scope|.
+  template <typename T>
+  static T *OutputResetHiddenPrevFrom(const VariableNameMap &outputs,
+                                      const Scope &scope) {
+    return GetVarValue<T>("ResetHiddenPrev", outputs, scope);
+  }
  template <typename T>
  static T *OutputBatchHiddenFrom(const VariableNameMap &outputs,
                                  const Scope &scope) {
@@ -2444,6 +2462,51 @@ class GruParam : public OpParam {
 };
 #endif
+#ifdef GRU_UNIT_OP
+// Parameter bundle for the gru_unit operator. The constructor resolves the
+// input/output tensors from the scope and reads the integer "activation" and
+// "gate_activation" attributes; the accessors expose them to the kernel.
+template <typename Dtype>
+class GruUnitParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+ public:
+  // Members are initialized in declaration order, which is also the order in
+  // which the lookups are performed.
+  GruUnitParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+               const AttributeMap &attrs, const Scope &scope)
+      : input_input_(InputFrom<GType>(inputs, scope)),
+        input_hidden_prev_(InputHiddenPrevFrom<GType>(inputs, scope)),
+        input_bias_(InputBiasFrom<GType>(inputs, scope)),
+        input_weight_(InputWeightFrom<GType>(inputs, scope)),
+        output_gate_(OutputGateFrom<GType>(outputs, scope)),
+        output_reset_hidden_prev_(
+            OutputResetHiddenPrevFrom<GType>(outputs, scope)),
+        output_hidden_(OutputHiddenFrom<GType>(outputs, scope)),
+        activation_(GetAttr<int>("activation", attrs)),
+        gate_activation_(GetAttr<int>("gate_activation", attrs)) {}
+
+  // Read-only inputs.
+  const GType *InputInput() const { return input_input_; }
+  const GType *InputWeight() const { return input_weight_; }
+  const GType *InputHiddenPrev() const { return input_hidden_prev_; }
+  const GType *InputBias() const { return input_bias_; }
+
+  // Integer activation codes as stored in the op attributes.
+  const int &Activation() const { return activation_; }
+  const int &GateActivation() const { return gate_activation_; }
+
+  // Writable outputs.
+  GType *OutGate() const { return output_gate_; }
+  GType *OutResetHiddenPrev() const { return output_reset_hidden_prev_; }
+  GType *OutHidden() const { return output_hidden_; }
+ private:
+  GType *input_input_;
+  GType *input_hidden_prev_;
+  GType *input_bias_;
+  GType *input_weight_;
+  GType *output_gate_;
+  GType *output_reset_hidden_prev_;
+  GType *output_hidden_;
+  int activation_;
+  int gate_activation_;
+};
+#endif
 #ifdef FLATTEN_OP
 template <typename Dtype>
 class FlattenParam : public OpParam {

--- a/test/framework/test_load.cpp
+++ b/test/framework/test_load.cpp
@@ -23,10 +23,11 @@ int main() {
  //  ../../../test/models/googlenet
  //  ../../../test/models/mobilenet
-  //  auto program = loader.Load(g_mobilenet_ssd, true);
+  std::string g_super = "../models/superresoltion";
+  //  auto program = loader.Load(g_super, true);
-  //  auto program = loader.Load(std::string(g_ocr) + "/model",
+  auto program = loader.Load(std::string(g_super) + "/model",
-  //                             std::string(g_ocr) + "/params", false);
+                             std::string(g_super) + "/params", false);
  //  program.originProgram->Description("program desc: ");
  return 0;

--- a/test/net/test_super.cpp
+++ b/test/net/test_super.cpp
@@ -18,7 +18,10 @@ limitations under the License. */
 #include "../test_include.h"
 int main() {
-  paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile;
+  paddle_mobile::PaddleMobileConfigInternal config;
+  config.load_when_predict = true;
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile(config);
  //    paddle_mobile.SetThreadNum(4);
  auto time1 = paddle_mobile::time();
 #ifdef PADDLE_MOBILE_CL
@@ -27,38 +30,121 @@ int main() {
  auto isok = paddle_mobile.Load(std::string(g_super) + "/model",
                                 std::string(g_super) + "/params", true, false,
-                                 1, true);
+                                 1, false);
  //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_mul), true);
  if (isok) {
    auto time2 = paddle_mobile::time();
    std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
              << std::endl;
+    // 300*300
+    //    std::vector<float> input;
+    //    std::vector<int64_t> dims{1, 1, 300, 300};
+    //    GetInput<float>(g_test_image_1x3x224x224, &input, dims);
+    //
+    //    std::vector<float> vec_result;
-    std::vector<float> input;
+    auto time3 = paddle_mobile::time();
-    std::vector<int64_t> dims{1, 1, 300, 300};
+    int max = 1;
-    GetInput<float>(g_yolo_img, &input, dims);
-    std::vector<float> vec_result;
+    //    for (int i = 0; i < max; ++i) {
+    //      auto time5 = paddle_mobile::time();
+    //      vec_result = paddle_mobile.Predict(input, dims);
+    //      auto time6 = paddle_mobile::time();
+    //      std::cout << "300 predict cost :第" << i << ": "
+    //                << paddle_mobile::time_diff(time5, time6) << "ms" <<
+    //                std::endl;
+    //    }
+    //    auto time4 = paddle_mobile::time();
+    //
+    //    std::cout << "300 predict cost :"
+    //              << paddle_mobile::time_diff(time3, time4) / max << "ms"
+    //              << std::endl;
+    //    auto biggest =
+    //        std::max_element(std::begin(vec_result), std::end(vec_result));
+    //    std::cout << "300 Max element is " << *biggest << " at position "
+    //              << std::distance(std::begin(vec_result), biggest) <<
+    //              std::endl;
+    //
+    //    // 500*500
+    //    std::vector<float> vec_result2;
+    //
+    //    std::vector<float> input2;
+    //    std::vector<int64_t> dims2{1, 1, 500, 500};
+    //    GetInput<float>(g_test_image_1x3x224x224, &input2, dims2);
+    //
+    //    time3 = paddle_mobile::time();
+    //    for (int i = 0; i < max; ++i) {
+    //      auto time5 = paddle_mobile::time();
+    //      vec_result2 = paddle_mobile.Predict(input2, dims2);
+    //      auto time6 = paddle_mobile::time();
+    //      std::cout << "500 predict cost :第" << i << ": "
+    //                << paddle_mobile::time_diff(time5, time6) << "ms" <<
+    //                std::endl;
+    //    }
+    //
+    //    time4 = paddle_mobile::time();
+    //    std::cout << "500 predict cost :"
+    //              << paddle_mobile::time_diff(time3, time4) / max << "ms"
+    //              << std::endl;
+    //    biggest = std::max_element(std::begin(vec_result2),
+    //    std::end(vec_result2)); std::cout << "500 Max element is " << *biggest
+    //    << " at position "
+    //              << std::distance(std::begin(vec_result2), biggest) <<
+    //              std::endl;
+    //
+    //    // 1000*1000
+    //
+    //    std::vector<float> vec_result3;
+    //    std::vector<float> input3;
+    //    std::vector<int64_t> dims3{1, 1, 1000, 1000};
+    //    GetInput<float>(g_test_image_1x3x224x224, &input3, dims3);
+    //
+    //    time3 = paddle_mobile::time();
+    //
+    //    for (int i = 0; i < max; ++i) {
+    //      auto time5 = paddle_mobile::time();
+    //      vec_result3 = paddle_mobile.Predict(input3, dims3);
+    //      auto time6 = paddle_mobile::time();
+    //      std::cout << "1000*1000 predict cost :第" << i << ": "
+    //                << paddle_mobile::time_diff(time5, time6) << "ms" <<
+    //                std::endl;
+    //    }
+    //    time4 = paddle_mobile::time();
+    //    std::cout << "1000*1000 predict cost :"
+    //              << paddle_mobile::time_diff(time3, time4) / max << "ms"
+    //              << std::endl;
+    //    biggest = std::max_element(std::begin(vec_result3),
+    //    std::end(vec_result3)); std::cout << "1000*1000 Max element is " <<
+    //    *biggest << " at position "
+    //              << std::distance(std::begin(vec_result3), biggest) <<
+    //              std::endl;
-    auto time3 = paddle_mobile::time();
+    // 224*224
-    int max = 10;
+    std::vector<float> vec_result4;
+    std::vector<float> input4;
+    std::vector<int64_t> dims4{1, 1, 300, 300};
+    GetInput<float>(g_test_image_1x3x224x224, &input4, dims4);
+    time3 = paddle_mobile::time();
    for (int i = 0; i < max; ++i) {
-      vec_result = paddle_mobile.Predict(input, dims);
+      auto time5 = paddle_mobile::time();
+      vec_result4 = paddle_mobile.Predict(input4, dims4);
+      auto time6 = paddle_mobile::time();
+      std::cout << "224*224 predict cost :第" << i << ": "
+                << paddle_mobile::time_diff(time5, time6) << "ms" << std::endl;
    }
-    auto time4 = paddle_mobile::time();
-    std::cout << "predict cost :"
+    auto time4 = paddle_mobile::time();
+    std::cout << "224*224 predict cost :"
              << paddle_mobile::time_diff(time3, time4) / max << "ms"
              << std::endl;
-    std::vector<float>::iterator biggest =
+    //    biggest = std::max_element(std::begin(vec_result4),
-        std::max_element(std::begin(vec_result), std::end(vec_result));
+    //    std::end(vec_result4)); std::cout << "224*224 Max element is " <<
-    std::cout << " Max element is " << *biggest << " at position "
+    //    *biggest << " at position "
-              << std::distance(std::begin(vec_result), biggest) << std::endl;
+    //              << std::distance(std::begin(vec_result4), biggest) <<
+    //              std::endl;
  }
-  std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
-               "是否存在?"
-            << std::endl;
  return 0;
 }
--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -63,6 +63,7 @@ static const char *g_imgfssd_ar = "../images/test_image_ssd_ar";
 static const char *g_imgfssd_ar1 = "../images/003_0001.txt";
 static const char *g_img = "../images/img.bin";
 static const char *g_yolo_img = "../images/in_put_1_3_416_416_2";
+static const char *g_super_img = "../images/mingren_input_data";
 static const char *g_mobilenet_img = "../images/image";
 using paddle_mobile::framework::DDim;

--- a/tools/ci_build.sh
+++ b/tools/ci_build.sh
+#!/usr/bin/env bash
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+# Print the command-line help text to stdout.
+#
+# Reads the color variables RED, BLUE, BOLD and NONE as well as SCRIPT_NAME;
+# init() assigns all of them, so call init() first.
+# NOTE(review): linux_armv8 is advertised below, but main() has no case for
+# it; the [Network] argument is likewise not consumed anywhere in this script.
+function print_usage() {
+  # `echo -e` is required here: bash's builtin echo prints "\n" and the
+  # "\033[...m" color codes literally unless escape processing is enabled.
+  echo -e "\n${RED}Usage${NONE}:
+  ${BOLD}${SCRIPT_NAME}${NONE} [Option] [Network]"
+  echo -e "\n${RED}Option${NONE}: required, specify the target platform
+  ${BLUE}android_armv7${NONE}: run build for android armv7 platform
+  ${BLUE}android_armv8${NONE}: run build for android armv8 platform
+  ${BLUE}ios${NONE}: run build for apple ios platform
+  ${BLUE}linux_armv7${NONE}: run build for linux armv7 platform
+  ${BLUE}linux_armv8${NONE}: run build for linux armv8 platform
+  "
+  echo -e "\n${RED}Network${NONE}: optional, for deep compressing the framework size
+  ${BLUE}googlenet${NONE}: build only googlenet support
+  ${BLUE}mobilenet${NONE}: build only mobilenet support
+  ${BLUE}yolo${NONE}: build only yolo support
+  ${BLUE}squeezenet${NONE}: build only squeezenet support
+  ${BLUE}resnet${NONE}: build only resnet support
+  ${BLUE}mobilenetssd${NONE}: build only mobilenetssd support
+  ${BLUE}nlp${NONE}: build only nlp model support
+  ${BLUE}mobilenetfssd${NONE}: build only mobilenetfssd support
+  ${BLUE}genet${NONE}: build only genet support
+  ${BLUE}super${NONE}: build only super support
+  "
+}
+function init() {
+  RED='\033[0;31m'
+  BLUE='\033[0;34m'
+  BOLD='\033[1m'
+  NONE='\033[0m'
+  PADDLE_MOBILE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../" && pwd )"
+  if [ -z "${SCRIPT_NAME}" ]; then
+      SCRIPT_NAME=$0
+  fi
+}
+# Abort the script unless the NDK_ROOT environment variable is non-empty.
+# On failure a hint is written to stderr and the script exits with status 1.
+function check_ndk() {
+  if [ -z "${NDK_ROOT}" ]; then
+    # printf is used because plain `echo` would print the "\n" literally;
+    # diagnostics go to stderr so they do not mix with build output.
+    printf 'Should set NDK_ROOT as your android ndk path, such as\n\n' >&2
+    printf '  export NDK_ROOT=~/android-ndk-r14b\n\n' >&2
+    # Exit statuses are limited to 0-255, so use 1 rather than -1.
+    exit 1
+  fi
+}
+# Configure and build the CPU-only Android armeabi-v7a (NEON) target.
+# Removes ../build/armeabi-v7a, regenerates it with cmake (MinSizeRel,
+# logging, Mali, OpenCL and FPGA all OFF), runs an 8-job make inside it and
+# finally returns to the previous directory.
+# All paths are relative to the caller's current working directory.
+build_android_armv7_cpu_only() {
+  local out_dir="../build/armeabi-v7a"
+  local -a cmake_flags=(
+    -DANDROID_ABI="armeabi-v7a with NEON"
+    -DCMAKE_BUILD_TYPE="MinSizeRel"
+    -DCMAKE_TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake"
+    -DANDROID_PLATFORM="android-22"
+    -DANDROID_STL=c++_static
+    -DANDROID=true
+    -DWITH_LOGGING=OFF
+    -DGPU_MALI=OFF
+    -DGPU_CL=OFF
+    -DFPGA=OFF
+  )
+
+  rm -rf "${out_dir}"
+  cmake .. -B"${out_dir}" "${cmake_flags[@]}"
+  cd "${out_dir}" && make -j 8
+  cd -
+}
+# Configure and build the Android armeabi-v7a (NEON) target with the Mali
+# and OpenCL back ends switched ON (FPGA and logging stay OFF).
+# Uses the same ../build/armeabi-v7a output directory as the CPU-only build,
+# which is removed first. Paths are relative to the current directory.
+build_android_armv7_gpu() {
+  local out_dir="../build/armeabi-v7a"
+  local -a cmake_flags=(
+    -DANDROID_ABI="armeabi-v7a with NEON"
+    -DCMAKE_BUILD_TYPE="MinSizeRel"
+    -DCMAKE_TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake"
+    -DANDROID_PLATFORM="android-22"
+    -DANDROID_STL=c++_static
+    -DANDROID=true
+    -DWITH_LOGGING=OFF
+    -DGPU_MALI=ON
+    -DGPU_CL=ON
+    -DFPGA=OFF
+  )
+
+  rm -rf "${out_dir}"
+  cmake .. -B"${out_dir}" "${cmake_flags[@]}"
+  cd "${out_dir}" && make -j 8
+  cd -
+}
+# Configure and build the CPU-only Android arm64-v8a target.
+# Removes ../build/arm64-v8a, regenerates it with cmake (MinSizeRel, logging,
+# Mali, OpenCL and FPGA all OFF) and runs make inside it.
+# Note that this function runs make with a single job (-j 1), unlike the
+# 8 jobs used by the other build functions in this script.
+build_android_armv8_cpu_only() {
+  local out_dir="../build/arm64-v8a"
+  local -a cmake_flags=(
+    -DANDROID_ABI="arm64-v8a"
+    -DCMAKE_BUILD_TYPE="MinSizeRel"
+    -DCMAKE_TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake"
+    -DANDROID_PLATFORM="android-22"
+    -DANDROID_STL=c++_static
+    -DANDROID=true
+    -DWITH_LOGGING=OFF
+    -DGPU_MALI=OFF
+    -DGPU_CL=OFF
+    -DFPGA=OFF
+  )
+
+  rm -rf "${out_dir}"
+  cmake .. -B"${out_dir}" "${cmake_flags[@]}"
+  cd "${out_dir}" && make -j 1
+  cd -
+}
+# Configure and build the Android arm64-v8a target with the Mali and OpenCL
+# back ends switched ON (FPGA and logging stay OFF).
+# Uses the same ../build/arm64-v8a output directory as the CPU-only build,
+# which is removed first. Paths are relative to the current directory.
+build_android_armv8_gpu() {
+  local out_dir="../build/arm64-v8a"
+  local -a cmake_flags=(
+    -DANDROID_ABI="arm64-v8a"
+    -DCMAKE_BUILD_TYPE="MinSizeRel"
+    -DCMAKE_TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake"
+    -DANDROID_PLATFORM="android-22"
+    -DANDROID_STL=c++_static
+    -DANDROID=true
+    -DWITH_LOGGING=OFF
+    -DGPU_MALI=ON
+    -DGPU_CL=ON
+    -DFPGA=OFF
+  )
+
+  rm -rf "${out_dir}"
+  cmake .. -B"${out_dir}" "${cmake_flags[@]}"
+  cd "${out_dir}" && make -j 8
+  cd -
+}
+# Configure and build the CPU-only iOS target in ../build/ios.
+# The directory is removed, regenerated with cmake using the ios-cmake
+# toolchain file (MinSizeRel, Mali, OpenCL and FPGA OFF) and built with an
+# 8-job make.
+# IOS_ARCH is forwarded as-is; this script never assigns it, so it is
+# presumably expected from the caller's environment.
+build_ios_armv8_cpu_only() {
+  local out_dir="../build/ios"
+  local -a cmake_flags=(
+    -DCMAKE_BUILD_TYPE="MinSizeRel"
+    -DCMAKE_TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake"
+    -DIOS_PLATFORM=OS
+    -DIOS_ARCH="${IOS_ARCH}"
+    -DIS_IOS=true
+    -DGPU_MALI=OFF
+    -DGPU_CL=OFF
+    -DFPGA=OFF
+  )
+
+  rm -rf "${out_dir}"
+  cmake .. -B"${out_dir}" "${cmake_flags[@]}"
+  cd "${out_dir}" && make -j 8
+  cd -
+}
+function build_ios_armv8_gpu() {
+  rm -rf ../build/ios
+  cmake .. \
+    -B"../build/ios" \
+    -DCMAKE_BUILD_TYPE="MinSizeRel" \
+    -DCMAKE_TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake" \
+    -DIOS_PLATFORM=OS \
+    -DIOS_ARCH="${IOS_ARCH}" \
+    -DIS_IOS=true \
+    -DGPU_MALI=OFF \
+    -DGPU_CL=ON \
+    -DFPGA=OFF
+  cd ../build/ios && make -j 8
+  cd -
+}
+# Configure and build the CPU-only Linux armv7 target in ../build/armv7_linux.
+# The directory is removed, regenerated with cmake using the
+# arm-linux-gnueabihf toolchain file (MinSizeRel, Mali, OpenCL and FPGA OFF)
+# and built with an 8-job make.
+build_linux_armv7_cpu_only() {
+  local out_dir="../build/armv7_linux"
+  local -a cmake_flags=(
+    -DCMAKE_BUILD_TYPE="MinSizeRel"
+    -DCMAKE_TOOLCHAIN_FILE="./tools/toolchains/arm-linux-gnueabihf.cmake"
+    -DGPU_MALI=OFF
+    -DGPU_CL=OFF
+    -DFPGA=OFF
+  )
+
+  rm -rf "${out_dir}"
+  cmake .. -B"${out_dir}" "${cmake_flags[@]}"
+  cd "${out_dir}" && make -j 8
+  cd -
+}
+# Configure and build the Linux armv7 target in ../build/armv7_linux with the
+# Mali and OpenCL back ends switched ON (FPGA stays OFF).
+# The directory is removed first, so this replaces any CPU-only Linux build.
+build_linux_armv7_gpu() {
+  local out_dir="../build/armv7_linux"
+  local -a cmake_flags=(
+    -DCMAKE_BUILD_TYPE="MinSizeRel"
+    -DCMAKE_TOOLCHAIN_FILE="./tools/toolchains/arm-linux-gnueabihf.cmake"
+    -DGPU_MALI=ON
+    -DGPU_CL=ON
+    -DFPGA=OFF
+  )
+
+  rm -rf "${out_dir}"
+  cmake .. -B"${out_dir}" "${cmake_flags[@]}"
+  cd "${out_dir}" && make -j 8
+  cd -
+}
+# Android armv7 entry point: verify NDK_ROOT is set, then run the CPU-only
+# build. The GPU variant (build_android_armv7_gpu) is currently disabled.
+build_android_armv7() {
+  check_ndk
+  build_android_armv7_cpu_only
+}
+# Android armv8 entry point: verify NDK_ROOT is set, then run the CPU-only
+# build. The GPU variant (build_android_armv8_gpu) is currently disabled.
+build_android_armv8() {
+  check_ndk
+  build_android_armv8_cpu_only
+}
+# iOS entry point: run the CPU-only build.
+# The GPU variant (build_ios_armv8_gpu) is currently disabled.
+build_ios() {
+  build_ios_armv8_cpu_only
+}
+# Linux armv7 entry point: run the CPU-only cross build.
+#
+# No Android NDK check is done here. build_linux_armv7_cpu_only configures
+# with tools/toolchains/arm-linux-gnueabihf.cmake and passes no Android
+# options, so requiring NDK_ROOT would block Linux builds on machines
+# without an NDK.
+# NOTE(review): confirm the toolchain file does not read NDK_ROOT.
+function build_linux_armv7() {
+  build_linux_armv7_cpu_only
+  # The GPU variant (build_linux_armv7_gpu) is currently disabled.
+}
+# Script entry point: initialise the globals, then dispatch on the first
+# argument.
+#
+# $1 - target platform: android_armv7 | android_armv8 | ios | linux_armv7
+#
+# With no argument, or with -h/--help/help, the usage text is printed and the
+# script exits 0. Any other unrecognised value is reported on stderr and the
+# script exits 1, so a mistyped target cannot pass silently in CI.
+function main() {
+  local CMD=$1
+  init
+  case "${CMD}" in
+    android_armv7)
+      build_android_armv7
+      ;;
+    android_armv8)
+      build_android_armv8
+      ;;
+    ios)
+      build_ios
+      ;;
+    linux_armv7)
+      build_linux_armv7
+      ;;
+    ""|-h|--help|help)
+      print_usage
+      exit 0
+      ;;
+    *)
+      echo "Unsupported option: ${CMD}" >&2
+      print_usage
+      exit 1
+      ;;
+  esac
+}
+main $@
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -256,6 +256,7 @@ if(NOT FOUND_MATCH)
  set(IM2SEQUENCE_OP ON)
  set(LOOKUP_OP ON)
  set(GRU_OP ON)
+  set(GRU_UNIT_OP ON)
  set(CRF_OP ON)
  set(BILINEAR_INTERP_OP ON)
  set(SPLIT_OP ON)
@@ -450,6 +451,10 @@ if (GRU_OP)
  add_definitions(-DGRU_OP)
 endif()
+if (GRU_UNIT_OP)
+  add_definitions(-DGRU_UNIT_OP)
+endif()
 if (CRF_OP)
  add_definitions(-DCRF_OP)
 endif()

--- a/tools/python/modeltools/tools/model_combine.py
+++ b/tools/python/modeltools/tools/model_combine.py
 # coding=utf-8
 import os
-path = "yolo_v2_tofile_source/"  # 文件夹目录
+path = "mobilenet/"  # 文件夹目录
-to_file_path = "yolo_v2_tofile_combined/params"
+to_file_path = "mobilenet_combine/params"
 files = os.listdir(path)  # 得到文件夹下的所有文件名称
 files.sort(cmp=None, key=str.lower)
 to_file = open(to_file_path, "wb")
 for file in files:  # 遍历文件夹
-    if not os.path.isdir(file):  # 判断是否是文件夹，不是文件夹才打开
+    if not os.path.isdir(file) and file != ".DS_Store":  # 判断是否是文件夹，不是文件夹才打开
        f = open(path + "/" + file)  # 打开文件
        name = f.name
        print 'name:  ' + name