提交 708d5704 编写于 作者: D dolphin8

Merge remote-tracking branch 'upstream/develop' into develop

......@@ -69,3 +69,10 @@ build
# clion building directories
cmake-build-debug
cmake-build-release
#ios demo
demo/ios/PaddleMobileDemo/PaddleMobileDemo/googlenet_combine/
demo/ios/PaddleMobileDemo/PaddleMobileDemo/*.jpg
demo/ios/PaddleMobileDemo/PaddleMobileDemo/PaddleMobile/*.a
*.xcuserstate
\ No newline at end of file
cmake_minimum_required(VERSION 3.0)
project(paddle-mobile)

# Build switches. NOTE(review): the pre-merge dump declared USE_OPENMP twice
# (OFF then ON) and force-set DEBUGING after declaring it as an option, which
# stomps user configuration; keep a single declaration per option.
option(DEBUGING "enable debug mode" ON)
option(USE_OPENMP "openmp support" ON)
option(USE_EXCEPTION "use std exception" ON)
option(LOG_PROFILE "log profile" ON)
# select the platform to build
option(CPU "armv7 with neon" ON)
option(MALI_GPU "mali gpu" OFF)
option(FPGA "fpga" OFF)

# Gather all sources once; the platform blocks below prune kernels that are
# not needed for the selected backend.
# NOTE(review): file(GLOB_RECURSE) does not see files added after configure;
# an explicit source list would be more robust.
file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
include_directories(src/)

set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
if (DEBUGING)
    message(STATUS "debug")
    set(CMAKE_BUILD_TYPE Debug)
    set(CMAKE_CXX_FLAGS_DEBUG "-g -DNDEBUG")
    add_definitions(-DPADDLE_MOBILE_DEBUG)
else ()
    set(CMAKE_BUILD_TYPE Release)
    set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
    # Hide symbols by default in release to shrink the shared library.
    add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
endif ()

if (USE_EXCEPTION)
    message(STATUS "use exception")
    add_definitions(-DENABLE_EXCEPTION)
    add_definitions(-fexceptions)
else ()
    add_definitions(-fno-exceptions)
endif ()

if (LOG_PROFILE)
    add_definitions(-DPADDLE_MOBILE_PROFILE)
endif ()

if (USE_OPENMP)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
    add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
endif ()

# platform control
if (ARM_LINUX)
    include("${CMAKE_CURRENT_LIST_DIR}/tools/arm-platform.cmake")
endif ()

if (CPU)
    add_definitions(-DPADDLE_MOBILE_CPU)
else ()
    # list(REMOVE_ITEM) matches literal entries, not glob patterns, so the
    # wildcard paths must be expanded with file(GLOB_RECURSE) first and then
    # removed one by one.
    file(GLOB_RECURSE _tmp_list src/operators/kernel/arm/*.cpp src/operators/kernel/arm/*.cc)
    foreach (f ${_tmp_list})
        list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
    endforeach ()
    file(GLOB_RECURSE _tmp_list_h src/operators/kernel/arm/*.h)
    foreach (f ${_tmp_list_h})
        list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
    endforeach ()
endif ()

if (MALI_GPU)
......@@ -41,64 +76,48 @@ if (MALI_GPU)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ACL=1")
else()
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/*.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/*.cc)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/*.cpp)
file(GLOB_RECURSE _tmp_list src/operators/kernel/mali/*.cpp src/operators/kernel/mali/*.cc)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list_h src/operators/kernel/mali/*.h)
foreach(f ${_tmp_list_h})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
if(FPGA)
add_definitions(-DPADDLE_MOBILE_FPGA)
else()
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/fpga/*.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/fpga/*.cc)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/fpga/*.cpp)
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list_h src/operators/kernel/fpga/*.h)
foreach(f ${_tmp_list_h})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
if (DEBUGING)
message(STATUS "debug")
set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_CXX_FLAGS_DEBUG "-g -DNDEBUG")
add_definitions(-DPADDLE_MOBILE_DEBUG)
if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
add_definitions(-DARMV7)
if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
endif ()
else ()
set(CMAKE_BUILD_TYPE Release)
set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
endif ()
if (USE_EXCEPTION)
message(STATUS "use exception")
add_definitions(-DENABLE_EXCEPTION)
add_definitions(-fexceptions)
add_definitions(-DARMV7)
else()
add_definitions(-fno-exceptions)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
endif ()
if (LOG_PROFILE)
add_definitions(-DPADDLE_MOBILE_PROFILE)
endif()
if(USE_OPENMP)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
endif()
if (NOT ANDROID_NDK_TOOLCHAIN_INCLUDED)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.cpp)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
if (IS_IOS)
else()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobile.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobile.mm)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h)
endif ()
include_directories(src/)
set(CMAKE_VERBOSE_MAKEFILE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build)
......@@ -108,19 +127,24 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
# NET default
# NOTE(review): "defult" is misspelled, but the string must stay in sync with
# the checks in tools/op.cmake — fix both together, not here alone.
set(NET "defult" CACHE STRING "select net type")
set_property(CACHE NET PROPERTY STRINGS "defult" "googlenet" "mobilenet" "yolo" "squeezenet")

include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")

# build library
if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
    list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS)
    add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
elseif (IS_IOS)
    add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
else ()
    add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
endif ()

# unit test (the test tree does not build on iOS)
if (DEBUGING)
    if (NOT IS_IOS)
        add_subdirectory(test)
    endif ()
endif ()
......@@ -183,7 +183,6 @@ upstream
接下来等待 review,如果有需要修改的地方,参照上述步骤更新 origin 中的对应分支即可。
![](http://otkwwi4x8.bkt.clouddn.com/2018-06-20-15294877166787.jpg)
之后就可以提交代码了
......@@ -223,7 +222,6 @@ upstream
- 原因:如果仅仅修改一个文件但提交了十几个commit,每个commit只做了少量的修改,这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改,且不排除commit之间的修改存在相互覆盖的情况。
- 建议:每次提交时,保持尽量少的commit,可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit,可以参考[squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)
- 请注意每个commit的名称:应能反映当前commit的内容,不能太随意。
3. 如果解决了某个Issue的问题,请在该Pull Request的**第一个**评论框中加上:`fix #issue_number`,这样当该Pull Request被合并后,会自动关闭对应的Issue。关键词包括:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)
此外,在回复评审人意见时,请您遵守以下约定:
......
......@@ -8,22 +8,33 @@
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)-->
欢迎来到 Paddle-Mobile GitHub 项目。
Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平台的深度学习的框架。Paddle-Mobile设计思想和PaddlePaddle的最新版fluid版本保持了高度一致,同时针对嵌入式做了大量优化。设计之初就对嵌入式的性能、体积、能耗、硬件平台覆盖等方面做了考虑。
## 简单搜索线上效果
如下gif是简单搜索app的线上主体检测应用效果
![ezgif-1-050a733dfb](http://otkwwi4x8.bkt.clouddn.com/2018-07-05-ezgif-1-050a733dfb.gif)
## Demo目录
[点我](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo)
## Features
- **ARM CPU**
![](http://7xop3k.com1.z0.glb.clouddn.com/15312108766575.jpg)
arm cpu是paddle-mobile的主要支持方向,cpu的通用性一直是其优势。嵌入式深度学习,需要大量的cpu汇编实现。我们正在紧锣密鼓的编码,为的是能充分利用硬件的每一点加速能力。
arm cpu的优化工作还在进行中,现在使用了常规的cpu优化。在arm a73上paddle-mobile arm-v7现在单核运行一次mobilenet1.0是120+ms,显然这不是我们的最终目标,我们正在用大量的汇编改写,后续性能仍会有巨大提升空间。
- **Mali GPU**
Mali GPU是百度和ARM合作开发的,双方团队近期都在致力于将paddle的op能无缝运行在ACL(arm compute library)。目前已经支持squeezenet,googlenet,resnet等几个网络模型,后续会继续加大力度。使全部移动端paddle op能高效运行在mali gpu上。
- **苹果设备的GPU Metal实现**
......@@ -45,7 +56,7 @@ Paddle-Moible是PaddlePaddle组织下的项目,是一个致力于嵌入式平
- **体积**
paddle-mobile从设计之初就深入考虑到移动端的包体积的问题,cpu实现中没有外部依赖。在编译过程中,如果该网络不需要的op是完全不会被打入的。同时编译选项优化也为体积压缩提供了帮助。
除了二进制体积,我们对代码体积极力避免过大。整个仓库的代码体积也非常小
## 文档
......@@ -58,7 +69,7 @@ Paddle-Moible是PaddlePaddle组织下的项目,是一个致力于嵌入式平
### 开发文档
开发文档主要是关于编译、运行等问题。做为开发者,它可以和贡献文档共同结合使用。
[开发文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_doc.md)
### 贡献文档
- [贡献文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
......@@ -78,17 +89,10 @@ ONNX全称为“Open Neural Network Exchange”,即“开放的神经网络切
目前,百度也在做onnx支持工作。相关转换项目在这里:[paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx)
```flow
st=>start: 其他模型
op1=>operation: onnx模型
op2=>operation: paddle-onnx
op3=>operation: paddle fluid模型
e=>end: paddle-mobile运行
st->op1->op2->op3->e
```
### 4. 部分测试模型和测试图片下载
[下载链接](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)
## 问题解决
......
*.iml
.gradle
/local.properties
/.idea/workspace.xml
/.idea/libraries
.DS_Store
/build
/captures
.externalNativeBuild
// Module-level Gradle build file for the paddle-mobile Android demo app.
apply plugin: 'com.android.application'

android {
    // NOTE(review): compileSdkVersion/targetSdkVersion 21 are very old;
    // bump together when updating the support library below.
    compileSdkVersion 21
    defaultConfig {
        applicationId "com.baidu.paddle"
        minSdkVersion 15
        targetSdkVersion 21
        versionCode 1
        versionName "1.0"
    }
    buildTypes {
        release {
            // Code shrinking is disabled; the ProGuard files are only used
            // if minifyEnabled is switched on.
            minifyEnabled false
            proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
        }
    }
}

dependencies {
    // Bundle any jars dropped into libs/ plus the AppCompat support library.
    implementation fileTree(dir: 'libs', include: ['*.jar'])
    implementation 'com.android.support:appcompat-v7:21.0.3'
}
# Add project specific ProGuard rules here.
# You can control the set of applied configuration files using the
# proguardFiles setting in build.gradle.
#
# For more details, see
# http://developer.android.com/guide/developing/tools/proguard.html
# If your project uses WebView with JS, uncomment the following
# and specify the fully qualified class name to the JavaScript interface
# class:
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
# public *;
#}
# Uncomment this to preserve the line number information for
# debugging stack traces.
#-keepattributes SourceFile,LineNumberTable
# If you keep the line number information, uncomment this to
# hide the original source file name.
#-renamesourcefileattribute SourceFile
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
package="com.baidu.paddle">
<!-- 往SDCard写入数据权限 -->
<uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE" />
<uses-permission android:name="android.permission.CAMERA" />
<application
android:allowBackup="true"
android:icon="@mipmap/ic_launcher"
android:label="@string/app_name"
android:supportsRtl="true"
android:theme="@style/AppTheme" >
<activity
android:name=".MainActivity"
android:label="@string/app_name">
<intent-filter>
<action android:name="android.intent.action.MAIN" />
<category android:name="android.intent.category.LAUNCHER" />
</intent-filter>
</activity>
</application>
</manifest>
/*
* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of
* the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
* THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
package com.baidu.paddle;
import android.app.Activity;
import android.content.Context;
import android.content.Intent;
import android.graphics.Bitmap;
import android.graphics.BitmapFactory;
import android.graphics.Canvas;
import android.graphics.Color;
import android.graphics.Paint;
import android.net.Uri;
import android.os.AsyncTask;
import android.os.Bundle;
import android.os.Environment;
import android.provider.MediaStore;
import android.util.Log;
import android.view.View;
import android.widget.Button;
import android.widget.ImageView;
import android.widget.TextView;
import android.widget.Toast;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import static android.graphics.Color.blue;
import static android.graphics.Color.green;
import static android.graphics.Color.red;
import static com.baidu.paddle.MainActivity.TYPE.googlenet;
/**
 * Demo activity: takes a photo with the camera, feeds it through the native
 * paddle-mobile library (via {@link PML}) and draws the predicted googlenet
 * bounding box on the captured image.
 */
public class MainActivity extends Activity {
    public static final int TAKE_PHOTO_REQUEST_CODE = 1001;
    private Context mContext = null;
    // Model input is a 224x224 RGB image, NCHW shape {1, 3, 224, 224}.
    private int inputSize = 224;
    private int[] ddims = {1, 3, 224, 224};

    enum TYPE {
        googlenet
    }

    private TYPE type = googlenet;
    private ImageView imageView;
    private TextView tvSpeed;
    private Button button;
    private Bitmap bmp;

    static {
        // Load the native inference library. Failures are only logged so the
        // activity can still come up (predictions will then fail at call time).
        try {
            System.loadLibrary("paddle-mobile");
        } catch (SecurityException e) {
            e.printStackTrace();
        } catch (UnsatisfiedLinkError e) {
            e.printStackTrace();
        } catch (NullPointerException e) {
            e.printStackTrace();
        }
    }

    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        mContext = this;
        setContentView(R.layout.main_activity);
        init();
    }

    /** Wires up the UI and copies the bundled demo model assets to the sdcard. */
    private void init() {
        imageView = (ImageView) findViewById(R.id.imageView);
        tvSpeed = (TextView) findViewById(R.id.tv_speed);
        button = (Button) findViewById(R.id.button);
        button.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View view) {
                if (!isHasSdCard()) {
                    Toast.makeText(mContext, R.string.sdcard_not_available,
                            Toast.LENGTH_LONG).show();
                    return;
                }
                Intent intent = new Intent(MediaStore.ACTION_IMAGE_CAPTURE);
                // save pic in sdcard
                Uri imageUri = Uri.fromFile(getTempImage());
                intent.putExtra(MediaStore.EXTRA_OUTPUT, imageUri);
                startActivityForResult(intent, TAKE_PHOTO_REQUEST_CODE);
            }
        });
        Button bt_load = (Button) findViewById(R.id.bt_load);
        bt_load.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View view) {
                String assetPath = "pml_demo";
                String sdcardPath = Environment.getExternalStorageDirectory()
                        + File.separator + assetPath + File.separator + type;
                PML.load(sdcardPath);
                // Alternative: load the combined-parameter variant of the model.
                // Kept as a reference; enable by swapping PML.load above for
                // PML.loadCombined below.
                String modelPath = Environment.getExternalStorageDirectory()
                        + File.separator + assetPath +
                        File.separator + "googlenet_combine" + File.separator + "model";
                String paramPath = Environment.getExternalStorageDirectory()
                        + File.separator + assetPath +
                        File.separator + "googlenet_combine" + File.separator + "params";
                // PML.loadCombined(modelPath, paramPath);
            }
        });
        Button bt_clear = (Button) findViewById(R.id.bt_clear);
        bt_clear.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View view) {
                PML.clear();
            }
        });
        String assetPath = "pml_demo";
        String sdcardPath = Environment.getExternalStorageDirectory()
                + File.separator + assetPath;
        copyFilesFromAssets(this, assetPath, sdcardPath);
    }

    /**
     * Recursively copies an asset file or directory to external storage.
     *
     * @param context context used to access the AssetManager
     * @param oldPath asset path to copy from
     * @param newPath filesystem path to copy to
     */
    public void copyFilesFromAssets(Context context, String oldPath, String newPath) {
        try {
            String[] fileNames = context.getAssets().list(oldPath);
            // list() may return null for a non-existent path; treat that like
            // an empty directory instead of crashing with an NPE.
            if (fileNames != null && fileNames.length > 0) {
                // directory
                File file = new File(newPath);
                file.mkdirs();
                // copy recursively
                for (String fileName : fileNames) {
                    copyFilesFromAssets(context, oldPath + "/" + fileName,
                            newPath + "/" + fileName);
                }
            } else {
                // file
                InputStream is = context.getAssets().open(oldPath);
                FileOutputStream fos = new FileOutputStream(new File(newPath));
                byte[] buffer = new byte[1024];
                int byteCount;
                while ((byteCount = is.read(buffer)) != -1) {
                    fos.write(buffer, 0, byteCount);
                }
                fos.flush();
                is.close();
                fos.close();
            }
        } catch (Exception e) {
            // Best-effort copy: the demo continues even if an asset fails.
            e.printStackTrace();
        }
    }

    /**
     * Creates (if needed) and returns the temp photo file on external storage,
     * or null when no sdcard is mounted.
     */
    public File getTempImage() {
        if (Environment.getExternalStorageState().equals(
                Environment.MEDIA_MOUNTED)) {
            File tempFile = new File(Environment.getExternalStorageDirectory(), "temp.jpg");
            try {
                tempFile.createNewFile();
            } catch (IOException e) {
                e.printStackTrace();
            }
            return tempFile;
        }
        return null;
    }

    @Override
    public void onActivityResult(int requestCode, int resultCode, Intent data) {
        super.onActivityResult(requestCode, resultCode, data);
        switch (requestCode) {
            case TAKE_PHOTO_REQUEST_CODE:
                if (resultCode == RESULT_OK) {
                    // Run detection off the UI thread on the captured photo.
                    DetectionTask detectionTask = new DetectionTask();
                    detectionTask.execute(getTempImage().getPath());
                }
                break;
            default:
                break;
        }
    }

    /**
     * draw rect on imageView
     *
     * @param bitmap     bitmap to draw onto (also shown in the ImageView)
     * @param predicted  model output; for googlenet the first four values are
     *                   box coordinates in 224x224 input space
     * @param viewWidth  target view width used to scale the box
     * @param viewHeight target view height used to scale the box
     */
    private void drawRect(Bitmap bitmap, float[] predicted, int viewWidth, int viewHeight) {
        Canvas canvas = new Canvas(bitmap);
        canvas.drawBitmap(bitmap, 0, 0, null);
        if (type == googlenet) {
            Paint paint = new Paint();
            paint.setColor(Color.RED);
            paint.setStyle(Paint.Style.STROKE);
            paint.setStrokeWidth(3.0f);
            float x1 = 0;
            float x2 = 0;
            float y1 = 0;
            float y2 = 0;
            // the googlenet result sequence is (left, top, right, bottom),
            // expressed in the 224x224 model input space — scale to the view.
            x1 = (predicted[0] * viewWidth / 224);
            y1 = (predicted[1] * viewHeight / 224);
            x2 = (predicted[2] * viewWidth / 224);
            y2 = (predicted[3] * viewHeight / 224);
            canvas.drawRect(x1, y1, x2, y2, paint);
        }
        imageView.setImageBitmap(bitmap);
    }

    /**
     * Returns the index of the largest score in {@code predicted} (as a float
     * — return type kept for interface compatibility). Returns 0 for a null
     * or empty array.
     */
    float getMaxIndex(float[] predicted) {
        if (predicted == null || predicted.length == 0) {
            return 0;
        }
        // Seed from the first element so all-negative score vectors are
        // handled correctly (the old "max = 0" start missed them).
        float max = predicted[0];
        int index = 0;
        for (int i = 1; i < predicted.length; i++) {
            if (predicted[i] > max) {
                max = predicted[i];
                index = i;
            }
        }
        return index;
    }

    /**
     * Scales the bitmap to desWidth x desHeight and converts it to a planar
     * CHW float buffer (R plane, G plane, B plane), subtracting 148 from each
     * channel — presumably the training mean; TODO confirm against the model.
     */
    public float[] getScaledMatrix(Bitmap bitmap, int desWidth,
                                   int desHeight) {
        float[] dataBuf = new float[3 * desWidth * desHeight];
        int rIndex;
        int gIndex;
        int bIndex;
        int[] pixels = new int[desWidth * desHeight];
        Bitmap bm = Bitmap.createScaledBitmap(bitmap, desWidth, desHeight, false);
        bm.getPixels(pixels, 0, desWidth, 0, 0, desWidth, desHeight);
        int j = 0;
        int k = 0;
        for (int i = 0; i < pixels.length; i++) {
            int clr = pixels[i];
            // Row/column of pixel i. (Was "i / desHeight" — identical for the
            // square 224x224 input, but only "i / desWidth" is correct in general.)
            j = i / desWidth;
            k = i % desWidth;
            rIndex = j * desWidth + k;
            gIndex = rIndex + desHeight * desWidth;
            bIndex = gIndex + desHeight * desWidth;
            dataBuf[rIndex] = (float) ((clr & 0x00ff0000) >> 16) - 148;
            dataBuf[gIndex] = (float) ((clr & 0x0000ff00) >> 8) - 148;
            dataBuf[bIndex] = (float) ((clr & 0x000000ff)) - 148;
        }
        // Free the scaled copy unless createScaledBitmap returned the source
        // bitmap itself. (The old check "if (bm.isRecycled()) recycle()" never
        // fired, leaking the intermediate bitmap.)
        if (bm != bitmap && !bm.isRecycled()) {
            bm.recycle();
        }
        return dataBuf;
    }

    /**
     * check whether sdcard is mounted
     *
     * @return true when external storage is mounted and writable
     */
    public boolean isHasSdCard() {
        return Environment.getExternalStorageState().equals(
                Environment.MEDIA_MOUNTED);
    }

    /** Debug helper: writes the result values space-separated to a file. */
    public void dumpData(float[] results, String filename) {
        try {
            File writename = new File(filename);
            writename.createNewFile();
            BufferedWriter out = new BufferedWriter(new FileWriter(writename));
            for (float result : results) {
                out.write(result + " ");
            }
            out.flush();
            out.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * scale bitmap in case of OOM
     *
     * @param ctx      unused; kept for interface compatibility
     * @param filePath path of the image file to decode
     * @return decoded bitmap, downsampled so both sides are below ~500px
     */
    public Bitmap getScaleBitmap(Context ctx, String filePath) {
        BitmapFactory.Options opt = new BitmapFactory.Options();
        opt.inJustDecodeBounds = true;
        BitmapFactory.decodeFile(filePath, opt);
        int bmpWidth = opt.outWidth;
        int bmpHeight = opt.outHeight;
        int maxSize = 500;
        // Double the sample size until the decoded image fits under maxSize.
        opt.inSampleSize = 1;
        while (true) {
            if (bmpWidth / opt.inSampleSize < maxSize || bmpHeight / opt.inSampleSize < maxSize) {
                break;
            }
            opt.inSampleSize *= 2;
        }
        opt.inJustDecodeBounds = false;
        Bitmap bmp = BitmapFactory.decodeFile(filePath, opt);
        return bmp;
    }

    @Override
    public void onBackPressed() {
        super.onBackPressed();
        Log.d("pml", "pml clear");
        // clear pml
        PML.clear();
    }

    /**
     * Background task: decodes the photo at the given path, runs native
     * prediction, then draws the result box and timing on the UI thread.
     */
    class DetectionTask extends AsyncTask<String, Void, float[]> {
        private long time;

        public DetectionTask() {
            super();
        }

        @Override
        protected void onPreExecute() {
            super.onPreExecute();
            if (type == googlenet) {
                inputSize = 224;
            }
        }

        @Override
        protected void onPostExecute(float[] result) {
            super.onPostExecute(result);
            try {
                Bitmap src = Bitmap.createScaledBitmap(bmp, imageView.getWidth(),
                        imageView.getHeight(), false);
                drawRect(src, result, imageView.getWidth(), imageView.getHeight());
                tvSpeed.setText("detection cost:" + time + "ms");
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        @Override
        protected void onProgressUpdate(Void... values) {
            super.onProgressUpdate(values);
        }

        @Override
        protected void onCancelled() {
            super.onCancelled();
        }

        @Override
        protected float[] doInBackground(String... strings) {
            bmp = getScaleBitmap(mContext, strings[0]);
            float[] inputData = getScaledMatrix(bmp, inputSize, inputSize);
            float[] result = null;
            try {
                long start = System.currentTimeMillis();
                result = PML.predictImage(inputData, ddims);
                long end = System.currentTimeMillis();
                time = end - start;
            } catch (Exception e) {
                e.printStackTrace();
            }
            return result;
        }
    }
}
package com.baidu.paddle;

/**
 * JNI facade over the native paddle-mobile inference library (loaded as
 * "paddle-mobile" by the caller before any of these methods are used).
 */
public class PML {
    /**
     * Load separated parameters
     * @param modelDir directory holding the model and its parameter files
     * @return presumably true on success — confirm in the native implementation
     */
    public static native boolean load(String modelDir);

    /**
     * Load combined parameters
     * @param modelPath path to the combined model file
     * @param paramPath path to the combined parameter file
     * @return presumably true on success — confirm in the native implementation
     */
    public static native boolean loadCombined(String modelPath,String paramPath);

    /**
     * object detection
     *
     * @param buf planar float input buffer for the model
     * @param ddims input shape (e.g. {1, 3, 224, 224})
     * @return prediction output from the native side
     */
    public static native float[] predictImage(float[] buf, int[]ddims);

    /**
     * Run prediction directly on a camera frame.
     *
     * @param buf        byte array in yuv420 format
     * @param imgWidth   width of the yuv data
     * @param imgHeight  height of the yuv data
     * @param ddims      shape of the input data
     * @param meanValues per-channel means from model training
     * @return prediction output from the native side
     */
    public static native float[] predictYuv(byte[] buf, int imgWidth, int imgHeight, int[] ddims, float[]meanValues);

    /** Release the native model and its resources. */
    public static native void clear();
}
<?xml version="1.0" encoding="utf-8"?>
<vector xmlns:android="http://schemas.android.com/apk/res/android"
android:width="108dp"
android:height="108dp"
android:viewportHeight="108"
android:viewportWidth="108">
<path
android:fillColor="#26A69A"
android:pathData="M0,0h108v108h-108z" />
<path
android:fillColor="#00000000"
android:pathData="M9,0L9,108"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M19,0L19,108"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M29,0L29,108"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M39,0L39,108"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M49,0L49,108"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M59,0L59,108"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M69,0L69,108"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M79,0L79,108"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M89,0L89,108"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M99,0L99,108"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M0,9L108,9"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M0,19L108,19"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M0,29L108,29"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M0,39L108,39"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M0,49L108,49"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M0,59L108,59"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M0,69L108,69"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M0,79L108,79"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M0,89L108,89"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M0,99L108,99"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M19,29L89,29"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M19,39L89,39"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M19,49L89,49"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M19,59L89,59"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M19,69L89,69"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M19,79L89,79"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M29,19L29,89"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M39,19L39,89"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M49,19L49,89"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M59,19L59,89"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M69,19L69,89"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
<path
android:fillColor="#00000000"
android:pathData="M79,19L79,89"
android:strokeColor="#33FFFFFF"
android:strokeWidth="0.8" />
</vector>
<?xml version="1.0" encoding="utf-8"?>
<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
android:orientation="vertical" android:layout_width="match_parent"
android:layout_height="match_parent">
<LinearLayout
android:id="@+id/ll_bottom"
android:layout_width="match_parent"
android:layout_height="wrap_content"
android:orientation="horizontal"
android:layout_alignParentBottom="true"
android:background="@android:color/background_light">
<TextView
android:id="@+id/tv_speed"
android:layout_width="match_parent"
android:layout_weight="1"
android:layout_height="wrap_content"
android:textColor="@android:color/background_dark"
android:text="@string/time_cost"/>
<Button
android:id="@+id/button"
android:layout_width="match_parent"
android:layout_height="50dp"
android:text="take photo"
android:layout_weight="1"
/>
</LinearLayout>
<LinearLayout
android:id="@+id/ll_test"
android:layout_above="@id/ll_bottom"
android:layout_width="match_parent"
android:layout_height="wrap_content"
android:orientation="horizontal">
<Button
android:id="@+id/bt_load"
android:layout_width="0dp"
android:layout_height="wrap_content"
android:layout_weight="1"
android:text="load"/>
<Button
android:layout_width="0dp"
android:layout_height="wrap_content"
android:id="@+id/bt_clear"
android:layout_weight="1"
android:text="clear"/>
</LinearLayout>
<ImageView
android:id="@+id/imageView"
android:layout_width="match_parent"
android:layout_height="match_parent"
android:scaleType="centerInside"
android:layout_above="@id/ll_test"
android:background="@android:color/background_light"/>
</RelativeLayout>
\ No newline at end of file
<?xml version="1.0" encoding="utf-8"?>
<resources>
<color name="colorPrimary">#3F51B5</color>
<color name="colorPrimaryDark">#303F9F</color>
<color name="colorAccent">#FF4081</color>
</resources>
<resources>
<string name="app_name">PaddleMobile_Android</string>
<string name="sdcard_not_available">sdcard not available</string>
<string name="time_cost">detection cost:</string>
</resources>
<resources>
<!-- Base application theme. -->
<style name="AppTheme" parent="Theme.AppCompat.Light.DarkActionBar">
<!-- Customize your theme here. -->
<item name="colorPrimary">@color/colorPrimary</item>
<item name="colorPrimaryDark">@color/colorPrimaryDark</item>
<item name="colorAccent">@color/colorAccent</item>
</style>
</resources>
// Top-level build file where you can add configuration options common to all sub-projects/modules.

buildscript {
    repositories {
        google()
        jcenter()
    }
    dependencies {
        // Android Gradle plugin shared by all modules.
        classpath 'com.android.tools.build:gradle:3.1.3'
        // NOTE: Do not place your application dependencies here; they belong
        // in the individual module build.gradle files
    }
}

allprojects {
    repositories {
        google()
        jcenter()
    }
}

// Standard helper: `gradlew clean` removes the root build directory.
task clean(type: Delete) {
    delete rootProject.buildDir
}
# Project-wide Gradle settings.
# IDE (e.g. Android Studio) users:
# Gradle settings configured through the IDE *will override*
# any settings specified in this file.
# For more details on how to configure your build environment visit
# http://www.gradle.org/docs/current/userguide/build_environment.html
# Specifies the JVM arguments used for the daemon process.
# The setting is particularly useful for tweaking memory settings.
org.gradle.jvmargs=-Xmx1536m
android.injected.testOnly=false
# When configured, Gradle will run in incubating parallel mode.
# This option should only be used with decoupled projects. More details, visit
# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
# org.gradle.parallel=true
#Mon Jul 02 13:58:58 CST 2018
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-4.4-all.zip
#!/usr/bin/env bash
##############################################################################
##
## Gradle start up script for UN*X
##
##############################################################################
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS=""
APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"
# Print a warning message without aborting the script.
warn ( ) {
echo "$*"
}
# Print an error message framed by blank lines, then exit with status 1.
die ( ) {
echo
echo "$*"
echo
exit 1
}
# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
case "`uname`" in
CYGWIN* )
cygwin=true
;;
Darwin* )
darwin=true
;;
MINGW* )
msys=true
;;
esac
# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
link=`expr "$ls" : '.*-> \(.*\)$'`
if expr "$link" : '/.*' > /dev/null; then
PRG="$link"
else
PRG=`dirname "$PRG"`"/$link"
fi
done
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
JAVACMD="$JAVA_HOME/jre/sh/java"
else
JAVACMD="$JAVA_HOME/bin/java"
fi
if [ ! -x "$JAVACMD" ] ; then
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
else
JAVACMD="java"
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
MAX_FD_LIMIT=`ulimit -H -n`
if [ $? -eq 0 ] ; then
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
MAX_FD="$MAX_FD_LIMIT"
fi
ulimit -n $MAX_FD
if [ $? -ne 0 ] ; then
warn "Could not set maximum file descriptor limit: $MAX_FD"
fi
else
warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
fi
fi
# For Darwin, add options to specify how the application appears in the dock
if $darwin; then
GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi
# For Cygwin, switch paths to Windows format before running java
if $cygwin ; then
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
JAVACMD=`cygpath --unix "$JAVACMD"`
# We build the pattern for arguments to be converted via cygpath
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
SEP=""
for dir in $ROOTDIRSRAW ; do
ROOTDIRS="$ROOTDIRS$SEP$dir"
SEP="|"
done
OURCYGPATTERN="(^($ROOTDIRS))"
# Add a user-defined pattern to the cygpath arguments
if [ "$GRADLE_CYGPATTERN" != "" ] ; then
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
fi
# Now convert the arguments - kludge to limit ourselves to /bin/sh
i=0
for arg in "$@" ; do
CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
else
eval `echo args$i`="\"$arg\""
fi
i=$((i+1))
done
case $i in
(0) set -- ;;
(1) set -- "$args0" ;;
(2) set -- "$args0" "$args1" ;;
(3) set -- "$args0" "$args1" "$args2" ;;
(4) set -- "$args0" "$args1" "$args2" "$args3" ;;
(5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
(6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
(7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
(8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
(9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
esac
fi
# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
# Capture word-split JVM options into the JVM_OPTS array; invoked via `eval`
# below so that shell quoting inside DEFAULT_JVM_OPTS/JAVA_OPTS/GRADLE_OPTS is honored.
function splitJvmOpts() {
JVM_OPTS=("$@")
}
eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
@rem
@rem ##########################################################################
@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS=
set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%
@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto init
echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:findJavaFromJavaHome
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
if exist "%JAVA_EXE%" goto init
echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:init
@rem Get command-line arguments, handling Windowz variants
if not "%OS%" == "Windows_NT" goto win9xME_args
if "%@eval[2+2]" == "4" goto 4NT_args
:win9xME_args
@rem Slurp the command line arguments.
set CMD_LINE_ARGS=
set _SKIP=2
:win9xME_args_slurp
if "x%~1" == "x" goto execute
set CMD_LINE_ARGS=%*
goto execute
:4NT_args
@rem Get arguments from the 4NT Shell from JP Software
set CMD_LINE_ARGS=%$
:execute
@rem Setup the command line
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd
:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1
:mainEnd
if "%OS%"=="Windows_NT" endlocal
:omega
......@@ -7,80 +7,123 @@
objects = {
/* Begin PBXBuildFile section */
FC086BC920E783AF00D85EF7 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = FC086BC820E783AF00D85EF7 /* AppDelegate.m */; };
FC086BCC20E783AF00D85EF7 /* ViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = FC086BCB20E783AF00D85EF7 /* ViewController.m */; };
FC086BCF20E783AF00D85EF7 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC086BCD20E783AF00D85EF7 /* Main.storyboard */; };
FC086BD120E783B100D85EF7 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = FC086BD020E783B100D85EF7 /* Assets.xcassets */; };
FC086BD420E783B100D85EF7 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC086BD220E783B100D85EF7 /* LaunchScreen.storyboard */; };
FC086BD720E783B100D85EF7 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = FC086BD620E783B100D85EF7 /* main.m */; };
FC12E93320EB6B2800807EF4 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = FC12E93220EB6B2800807EF4 /* AppDelegate.m */; };
FC12E93620EB6B2800807EF4 /* ViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = FC12E93520EB6B2800807EF4 /* ViewController.m */; };
FC12E93920EB6B2800807EF4 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC12E93720EB6B2800807EF4 /* Main.storyboard */; };
FC12E93B20EB6B2900807EF4 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = FC12E93A20EB6B2900807EF4 /* Assets.xcassets */; };
FC12E93E20EB6B2900807EF4 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC12E93C20EB6B2900807EF4 /* LaunchScreen.storyboard */; };
FC12E94120EB6B2900807EF4 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = FC12E94020EB6B2900807EF4 /* main.m */; };
FC12E94A20EB6B6800807EF4 /* libpaddle-mobile.a in Frameworks */ = {isa = PBXBuildFile; fileRef = FC12E94820EB6B6800807EF4 /* libpaddle-mobile.a */; };
FC12E94D20EB6BBB00807EF4 /* libstdc++.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = FC12E94C20EB6BBB00807EF4 /* libstdc++.tbd */; };
FC12E95420EB6C0D00807EF4 /* apple.jpg in Resources */ = {isa = PBXBuildFile; fileRef = FC12E95320EB6C0D00807EF4 /* apple.jpg */; };
FC51640120EF758D00636C28 /* params in Resources */ = {isa = PBXBuildFile; fileRef = FC5163FF20EF758D00636C28 /* params */; };
FC51640220EF758D00636C28 /* model in Resources */ = {isa = PBXBuildFile; fileRef = FC51640020EF758D00636C28 /* model */; };
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
FC086BC420E783AF00D85EF7 /* PaddleMobileDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = PaddleMobileDemo.app; sourceTree = BUILT_PRODUCTS_DIR; };
FC086BC720E783AF00D85EF7 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
FC086BC820E783AF00D85EF7 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };
FC086BCA20E783AF00D85EF7 /* ViewController.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ViewController.h; sourceTree = "<group>"; };
FC086BCB20E783AF00D85EF7 /* ViewController.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = ViewController.m; sourceTree = "<group>"; };
FC086BCE20E783AF00D85EF7 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = "<group>"; };
FC086BD020E783B100D85EF7 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
FC086BD320E783B100D85EF7 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = "<group>"; };
FC086BD520E783B100D85EF7 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
FC086BD620E783B100D85EF7 /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = "<group>"; };
FC12E92E20EB6B2800807EF4 /* PaddleMobileDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = PaddleMobileDemo.app; sourceTree = BUILT_PRODUCTS_DIR; };
FC12E93120EB6B2800807EF4 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
FC12E93220EB6B2800807EF4 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };
FC12E93420EB6B2800807EF4 /* ViewController.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ViewController.h; sourceTree = "<group>"; };
FC12E93520EB6B2800807EF4 /* ViewController.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = ViewController.m; sourceTree = "<group>"; };
FC12E93820EB6B2800807EF4 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = "<group>"; };
FC12E93A20EB6B2900807EF4 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
FC12E93D20EB6B2900807EF4 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = "<group>"; };
FC12E93F20EB6B2900807EF4 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
FC12E94020EB6B2900807EF4 /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = "<group>"; };
FC12E94820EB6B6800807EF4 /* libpaddle-mobile.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; path = "libpaddle-mobile.a"; sourceTree = "<group>"; };
FC12E94920EB6B6800807EF4 /* PaddleMobile.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PaddleMobile.h; sourceTree = "<group>"; };
FC12E94C20EB6BBB00807EF4 /* libstdc++.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = "libstdc++.tbd"; path = "usr/lib/libstdc++.tbd"; sourceTree = SDKROOT; };
FC12E95320EB6C0D00807EF4 /* apple.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = apple.jpg; sourceTree = "<group>"; };
FC5163FF20EF758D00636C28 /* params */ = {isa = PBXFileReference; lastKnownFileType = file; path = params; sourceTree = "<group>"; };
FC51640020EF758D00636C28 /* model */ = {isa = PBXFileReference; lastKnownFileType = file; path = model; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
FC086BC120E783AF00D85EF7 /* Frameworks */ = {
FC12E92B20EB6B2800807EF4 /* Frameworks */ = {
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
FC12E94D20EB6BBB00807EF4 /* libstdc++.tbd in Frameworks */,
FC12E94A20EB6B6800807EF4 /* libpaddle-mobile.a in Frameworks */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXFrameworksBuildPhase section */
/* Begin PBXGroup section */
FC086BBB20E783AF00D85EF7 = {
FC12E92520EB6B2800807EF4 = {
isa = PBXGroup;
children = (
FC086BC620E783AF00D85EF7 /* PaddleMobileDemo */,
FC086BC520E783AF00D85EF7 /* Products */,
FC12E93020EB6B2800807EF4 /* PaddleMobileDemo */,
FC12E92F20EB6B2800807EF4 /* Products */,
FC12E94B20EB6BBB00807EF4 /* Frameworks */,
);
sourceTree = "<group>";
};
FC086BC520E783AF00D85EF7 /* Products */ = {
FC12E92F20EB6B2800807EF4 /* Products */ = {
isa = PBXGroup;
children = (
FC086BC420E783AF00D85EF7 /* PaddleMobileDemo.app */,
FC12E92E20EB6B2800807EF4 /* PaddleMobileDemo.app */,
);
name = Products;
sourceTree = "<group>";
};
FC086BC620E783AF00D85EF7 /* PaddleMobileDemo */ = {
FC12E93020EB6B2800807EF4 /* PaddleMobileDemo */ = {
isa = PBXGroup;
children = (
FC086BC720E783AF00D85EF7 /* AppDelegate.h */,
FC086BC820E783AF00D85EF7 /* AppDelegate.m */,
FC086BCA20E783AF00D85EF7 /* ViewController.h */,
FC086BCB20E783AF00D85EF7 /* ViewController.m */,
FC086BCD20E783AF00D85EF7 /* Main.storyboard */,
FC086BD020E783B100D85EF7 /* Assets.xcassets */,
FC086BD220E783B100D85EF7 /* LaunchScreen.storyboard */,
FC086BD520E783B100D85EF7 /* Info.plist */,
FC086BD620E783B100D85EF7 /* main.m */,
FC12E94720EB6B6800807EF4 /* PaddleMobile */,
FC5163FE20EF758D00636C28 /* googlenet_combine */,
FC12E95320EB6C0D00807EF4 /* apple.jpg */,
FC12E93120EB6B2800807EF4 /* AppDelegate.h */,
FC12E93220EB6B2800807EF4 /* AppDelegate.m */,
FC12E93420EB6B2800807EF4 /* ViewController.h */,
FC12E93520EB6B2800807EF4 /* ViewController.m */,
FC12E93720EB6B2800807EF4 /* Main.storyboard */,
FC12E93A20EB6B2900807EF4 /* Assets.xcassets */,
FC12E93C20EB6B2900807EF4 /* LaunchScreen.storyboard */,
FC12E93F20EB6B2900807EF4 /* Info.plist */,
FC12E94020EB6B2900807EF4 /* main.m */,
);
path = PaddleMobileDemo;
sourceTree = "<group>";
};
FC12E94720EB6B6800807EF4 /* PaddleMobile */ = {
isa = PBXGroup;
children = (
FC12E94820EB6B6800807EF4 /* libpaddle-mobile.a */,
FC12E94920EB6B6800807EF4 /* PaddleMobile.h */,
);
path = PaddleMobile;
sourceTree = "<group>";
};
FC12E94B20EB6BBB00807EF4 /* Frameworks */ = {
isa = PBXGroup;
children = (
FC12E94C20EB6BBB00807EF4 /* libstdc++.tbd */,
);
name = Frameworks;
sourceTree = "<group>";
};
FC5163FE20EF758D00636C28 /* googlenet_combine */ = {
isa = PBXGroup;
children = (
FC5163FF20EF758D00636C28 /* params */,
FC51640020EF758D00636C28 /* model */,
);
path = googlenet_combine;
sourceTree = "<group>";
};
/* End PBXGroup section */
/* Begin PBXNativeTarget section */
FC086BC320E783AF00D85EF7 /* PaddleMobileDemo */ = {
FC12E92D20EB6B2800807EF4 /* PaddleMobileDemo */ = {
isa = PBXNativeTarget;
buildConfigurationList = FC086BDA20E783B100D85EF7 /* Build configuration list for PBXNativeTarget "PaddleMobileDemo" */;
buildConfigurationList = FC12E94420EB6B2900807EF4 /* Build configuration list for PBXNativeTarget "PaddleMobileDemo" */;
buildPhases = (
FC086BC020E783AF00D85EF7 /* Sources */,
FC086BC120E783AF00D85EF7 /* Frameworks */,
FC086BC220E783AF00D85EF7 /* Resources */,
FC12E92A20EB6B2800807EF4 /* Sources */,
FC12E92B20EB6B2800807EF4 /* Frameworks */,
FC12E92C20EB6B2800807EF4 /* Resources */,
);
buildRules = (
);
......@@ -88,24 +131,24 @@
);
name = PaddleMobileDemo;
productName = PaddleMobileDemo;
productReference = FC086BC420E783AF00D85EF7 /* PaddleMobileDemo.app */;
productReference = FC12E92E20EB6B2800807EF4 /* PaddleMobileDemo.app */;
productType = "com.apple.product-type.application";
};
/* End PBXNativeTarget section */
/* Begin PBXProject section */
FC086BBC20E783AF00D85EF7 /* Project object */ = {
FC12E92620EB6B2800807EF4 /* Project object */ = {
isa = PBXProject;
attributes = {
LastUpgradeCheck = 0930;
LastUpgradeCheck = 0940;
ORGANIZATIONNAME = orange;
TargetAttributes = {
FC086BC320E783AF00D85EF7 = {
CreatedOnToolsVersion = 9.3.1;
FC12E92D20EB6B2800807EF4 = {
CreatedOnToolsVersion = 9.4.1;
};
};
};
buildConfigurationList = FC086BBF20E783AF00D85EF7 /* Build configuration list for PBXProject "PaddleMobileDemo" */;
buildConfigurationList = FC12E92920EB6B2800807EF4 /* Build configuration list for PBXProject "PaddleMobileDemo" */;
compatibilityVersion = "Xcode 9.3";
developmentRegion = en;
hasScannedForEncodings = 0;
......@@ -113,55 +156,58 @@
en,
Base,
);
mainGroup = FC086BBB20E783AF00D85EF7;
productRefGroup = FC086BC520E783AF00D85EF7 /* Products */;
mainGroup = FC12E92520EB6B2800807EF4;
productRefGroup = FC12E92F20EB6B2800807EF4 /* Products */;
projectDirPath = "";
projectRoot = "";
targets = (
FC086BC320E783AF00D85EF7 /* PaddleMobileDemo */,
FC12E92D20EB6B2800807EF4 /* PaddleMobileDemo */,
);
};
/* End PBXProject section */
/* Begin PBXResourcesBuildPhase section */
FC086BC220E783AF00D85EF7 /* Resources */ = {
FC12E92C20EB6B2800807EF4 /* Resources */ = {
isa = PBXResourcesBuildPhase;
buildActionMask = 2147483647;
files = (
FC086BD420E783B100D85EF7 /* LaunchScreen.storyboard in Resources */,
FC086BD120E783B100D85EF7 /* Assets.xcassets in Resources */,
FC086BCF20E783AF00D85EF7 /* Main.storyboard in Resources */,
FC51640220EF758D00636C28 /* model in Resources */,
FC51640120EF758D00636C28 /* params in Resources */,
FC12E93E20EB6B2900807EF4 /* LaunchScreen.storyboard in Resources */,
FC12E93B20EB6B2900807EF4 /* Assets.xcassets in Resources */,
FC12E95420EB6C0D00807EF4 /* apple.jpg in Resources */,
FC12E93920EB6B2800807EF4 /* Main.storyboard in Resources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXResourcesBuildPhase section */
/* Begin PBXSourcesBuildPhase section */
FC086BC020E783AF00D85EF7 /* Sources */ = {
FC12E92A20EB6B2800807EF4 /* Sources */ = {
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
FC086BCC20E783AF00D85EF7 /* ViewController.m in Sources */,
FC086BD720E783B100D85EF7 /* main.m in Sources */,
FC086BC920E783AF00D85EF7 /* AppDelegate.m in Sources */,
FC12E93620EB6B2800807EF4 /* ViewController.m in Sources */,
FC12E94120EB6B2900807EF4 /* main.m in Sources */,
FC12E93320EB6B2800807EF4 /* AppDelegate.m in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXSourcesBuildPhase section */
/* Begin PBXVariantGroup section */
FC086BCD20E783AF00D85EF7 /* Main.storyboard */ = {
FC12E93720EB6B2800807EF4 /* Main.storyboard */ = {
isa = PBXVariantGroup;
children = (
FC086BCE20E783AF00D85EF7 /* Base */,
FC12E93820EB6B2800807EF4 /* Base */,
);
name = Main.storyboard;
sourceTree = "<group>";
};
FC086BD220E783B100D85EF7 /* LaunchScreen.storyboard */ = {
FC12E93C20EB6B2900807EF4 /* LaunchScreen.storyboard */ = {
isa = PBXVariantGroup;
children = (
FC086BD320E783B100D85EF7 /* Base */,
FC12E93D20EB6B2900807EF4 /* Base */,
);
name = LaunchScreen.storyboard;
sourceTree = "<group>";
......@@ -169,7 +215,7 @@
/* End PBXVariantGroup section */
/* Begin XCBuildConfiguration section */
FC086BD820E783B100D85EF7 /* Debug */ = {
FC12E94220EB6B2900807EF4 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
......@@ -220,14 +266,14 @@
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
IPHONEOS_DEPLOYMENT_TARGET = 11.3;
IPHONEOS_DEPLOYMENT_TARGET = 11.4;
MTL_ENABLE_DEBUG_INFO = YES;
ONLY_ACTIVE_ARCH = YES;
SDKROOT = iphoneos;
};
name = Debug;
};
FC086BD920E783B100D85EF7 /* Release */ = {
FC12E94320EB6B2900807EF4 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
......@@ -272,41 +318,53 @@
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
IPHONEOS_DEPLOYMENT_TARGET = 11.3;
IPHONEOS_DEPLOYMENT_TARGET = 11.4;
MTL_ENABLE_DEBUG_INFO = NO;
SDKROOT = iphoneos;
VALIDATE_PRODUCT = YES;
};
name = Release;
};
FC086BDB20E783B100D85EF7 /* Debug */ = {
FC12E94520EB6B2900807EF4 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
CODE_SIGN_STYLE = Automatic;
DEVELOPMENT_TEAM = Z5M2UUN5YV;
ENABLE_BITCODE = NO;
INFOPLIST_FILE = PaddleMobileDemo/Info.plist;
IPHONEOS_DEPLOYMENT_TARGET = 10.0;
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
);
LIBRARY_SEARCH_PATHS = (
"$(inherited)",
"$(PROJECT_DIR)/PaddleMobileDemo/PaddleMobile",
);
PRODUCT_BUNDLE_IDENTIFIER = orange.PaddleMobileDemo;
PRODUCT_NAME = "$(TARGET_NAME)";
TARGETED_DEVICE_FAMILY = "1,2";
};
name = Debug;
};
FC086BDC20E783B100D85EF7 /* Release */ = {
FC12E94620EB6B2900807EF4 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
CODE_SIGN_STYLE = Automatic;
DEVELOPMENT_TEAM = Z5M2UUN5YV;
ENABLE_BITCODE = NO;
INFOPLIST_FILE = PaddleMobileDemo/Info.plist;
IPHONEOS_DEPLOYMENT_TARGET = 10.0;
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
);
LIBRARY_SEARCH_PATHS = (
"$(inherited)",
"$(PROJECT_DIR)/PaddleMobileDemo/PaddleMobile",
);
PRODUCT_BUNDLE_IDENTIFIER = orange.PaddleMobileDemo;
PRODUCT_NAME = "$(TARGET_NAME)";
TARGETED_DEVICE_FAMILY = "1,2";
......@@ -316,25 +374,25 @@
/* End XCBuildConfiguration section */
/* Begin XCConfigurationList section */
FC086BBF20E783AF00D85EF7 /* Build configuration list for PBXProject "PaddleMobileDemo" */ = {
FC12E92920EB6B2800807EF4 /* Build configuration list for PBXProject "PaddleMobileDemo" */ = {
isa = XCConfigurationList;
buildConfigurations = (
FC086BD820E783B100D85EF7 /* Debug */,
FC086BD920E783B100D85EF7 /* Release */,
FC12E94220EB6B2900807EF4 /* Debug */,
FC12E94320EB6B2900807EF4 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
FC086BDA20E783B100D85EF7 /* Build configuration list for PBXNativeTarget "PaddleMobileDemo" */ = {
FC12E94420EB6B2900807EF4 /* Build configuration list for PBXNativeTarget "PaddleMobileDemo" */ = {
isa = XCConfigurationList;
buildConfigurations = (
FC086BDB20E783B100D85EF7 /* Debug */,
FC086BDC20E783B100D85EF7 /* Release */,
FC12E94520EB6B2900807EF4 /* Debug */,
FC12E94620EB6B2900807EF4 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
/* End XCConfigurationList section */
};
rootObject = FC086BBC20E783AF00D85EF7 /* Project object */;
rootObject = FC12E92620EB6B2800807EF4 /* Project object */;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#import <UIKit/UIKit.h>
// Application delegate for the PaddleMobile iOS demo.
@interface AppDelegate : UIResponder <UIApplicationDelegate>
// Root application window.
@property (strong, nonatomic) UIWindow *window;
@end
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#import "AppDelegate.h"
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#import <CoreImage/CoreImage.h>
#import <Foundation/Foundation.h>

/*
 Objective-C interface to the paddle-mobile inference engine.
 Typical flow: load a model, call one of the predict methods, then clear.
*/
@interface PaddleMobile : NSObject

/*
 Shared singleton instance.
*/
+ (instancetype)sharedInstance;

/*
 Create a new engine instance.
*/
- (instancetype)init;

/*
 Load a model from separate model and weights files; allocates the memory
 needed for inference. Returns YES on success.
*/
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weightsPath;

/*
 Predict on an image preprocessed with per-channel means and a scale factor
 (legacy API without explicit input dimensions).
*/
- (NSArray *)predict:(CGImageRef)image means:(NSArray<NSNumber *> *)means scale:(float)scale;

/*
 Predict on an image without preprocessing (legacy API without explicit
 input dimensions).
*/
- (NSArray *)predict:(CGImageRef)image;

/*
 Load a model whose parameter files are stored separately; pass the
 directory containing the model.
*/
- (BOOL)load:(NSString *)modelAndWeightPath;

/*
 Predict with explicit input dimensions; means and scale are the
 preprocessing parameters used when the model was trained. If the model was
 trained without such preprocessing, use predict:dim: directly.
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;

/*
 Predict with explicit input dimensions and no preprocessing.
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;

/*
 Release all memory held by the engine.
*/
- (void)clear;

@end
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#import <UIKit/UIKit.h>
// Demo view controller: loads the bundled model and runs one sample prediction on launch.
@interface ViewController : UIViewController
@end
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#import "PaddleMobile.h"
#import "ViewController.h"

@interface ViewController ()
@end

@implementation ViewController

// Runs a single demo inference when the view loads: loads the bundled
// model/params, predicts on apple.jpg, and logs the result and elapsed time.
- (void)viewDidLoad {
    [super viewDidLoad];

    PaddleMobile *pam = [[PaddleMobile alloc] init];

    // Both the model and params files must be bundled as app resources.
    NSString *modelPath = [[NSBundle mainBundle] pathForResource:@"model" ofType:nil];
    NSString *paramPath = [[NSBundle mainBundle] pathForResource:@"params" ofType:nil];
    if (modelPath.length == 0 || paramPath.length == 0) {
        NSLog(@" need model and param");
        return;
    }

    if ([pam load:modelPath andWeightsPath:paramPath]) {
        NSLog(@"load success");
        UIImage *inputImage = [UIImage imageNamed:@"apple.jpg"];
        if (!inputImage) {
            NSLog(@" input image is nil");
            return;
        }
        NSDate *beforeDate = [NSDate date];
        // dim is NCHW; means/scale must match the preprocessing used at training time.
        NSArray *res = [pam predict:inputImage.CGImage dim:@[@1, @3, @224, @224] means:@[@148, @148, @148] scale:1.0];
        NSLog(@"res: %@", res);
        NSLog(@"elapsed time: %f", [[NSDate date] timeIntervalSinceDate:beforeDate]);
    } else {
        // Previously a load failure was silently ignored; surface it for debugging.
        NSLog(@"load failed");
    }
}

@end
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#import <UIKit/UIKit.h>
#import "AppDelegate.h"
......
......@@ -4,9 +4,11 @@
## 编译
### 一. 使用 build.sh 编译
```sh
# 在 paddle-mobile 目录下:
cd tools
sh build.sh ios
# 如果只想编译某个特定模型的 op, 则需执行以下命令
......@@ -16,40 +18,33 @@ sh build.sh ios googlenet
cd ../build/release/ios/build
```
#### 常见问题:
### 二. 使用 xcode 编译
我们提供了 ios 开发更为熟悉的 xcode 编译环境:
在 ios/ 目录下打开 PaddleMobile.xcworkspace 即可编译 PaddleMobile 或者 运行 Demo
1. No iOS SDK's found in default search path ...
### 三. 集成
这个问题是因为 tools/ios-cmake/ios.toolchain.cmake 找不到你最近使用的 iOS SDK 路径, 所以需要自己进行指定,
以我当前的环境为例: 在 tools/ios-cmake/ios.toolchain.cmake 143行前添加我本地的 iOS SDK 路径: set(CMAKE_IOS_SDK_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk")
#### 如使用 c++ 接口
## 集成
```
将上一步生成的:
libpaddle-mobile.a
io.h
program.h
types.h
lod_tensor.h
tensor.h
```
拖入工程, io.h 为接口文件, 可在 [github](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/src/io/io.h)上查看接口注释
#### 如使用 oc 接口
将在xcode 编译生成的
```
libPaddleMobile.a
/src/ios_io/ 下的
PaddleMobile.h
```
拖入工程, 接口如下:
拖入工程
#### oc 接口
接口如下:
```
/*
创建单例对象
创建对象
*/
+ (instancetype)sharedInstance;
- (instancetype)init;
/*
load 模型, 开辟内存
......@@ -59,12 +54,12 @@ PaddleMobile.h
/*
进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
*/
- (NSArray *)predict:(CGImageRef)image means:(NSArray<NSNumber *> *)means scale:(float)scale;
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
/*
进行预测
*/
- (NSArray *)predict:(CGImageRef)image;
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
/*
清理内存
......@@ -74,7 +69,8 @@ PaddleMobile.h
```
#Android开发文档
# Android开发文档
用户可通过如下两种方式,交叉编译Android平台上适用的paddle-mobile库:
- 基于Docker容器编译
......@@ -196,27 +192,51 @@ which to test :
## 部署
Android应用可通过JNI接口调用底层C/C++,paddle-mobile对外提供的JNI接口如下:
##### 1 load接口 加载模型参数
##### 1 load接口 加载模型参数
- 用于加载参数文件分散的模型
```
/*
*@param modelPath 模型文件路径
*@return jboolean
*/
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
jclass thiz,
jstring modelPath);
/**
 * Load separated parameters
* @param modelDir
* @return
*/
public static native boolean load(String modelDir);
```
- 用于加载参数文件合并的模型文件
```
/**
* Load combined parameters
* @param modelPath
* @param paramPath
* @return
*/
public static native boolean loadCombined(String modelPath, String paramPath);
```
##### 2 predict接口 执行预测
- 接受预处理过的RGB数组的predict接口
```
/**
*@param buf 输入数据
*@return 输出数据
*/
JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predict(
JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
JNIEnv *env, jclass thiz, jfloatArray buf);
```
- 接受原始yuv数据的predict接口
```
/**
*
* @param buf yuv420格式的字节数组
* @param imgWidth yuv数据的宽
* @param imgHeight yuv数据的高
* @param ddims 输入数据的形状
* @param meanValues 模型训练时各通道的均值
* @return
*/
public static native float[] predictYuv(byte[] buf, int imgWidth, int imgHeight, int[] ddims, float[] meanValues);
```
##### 3 clear接口 销毁实例、清理内存操作
```
......
<?xml version="1.0" encoding="UTF-8"?>
<Workspace
version = "1.0">
<FileRef
location = "group:PaddleMobileDemo/PaddleMobileDemo.xcodeproj">
</FileRef>
<FileRef
location = "group:PaddleMobile/PaddleMobile.xcodeproj">
</FileRef>
</Workspace>
<?xml version="1.0" encoding="UTF-8"?>
<Bucket
type = "0"
version = "2.0">
</Bucket>
// !$*UTF8*$!
{
archiveVersion = 1;
classes = {
};
objectVersion = 50;
objects = {
/* Begin PBXBuildFile section */
FC086BB420E7839B00D85EF7 /* PaddleMobile.m in Sources */ = {isa = PBXBuildFile; fileRef = FC086BB320E7839B00D85EF7 /* PaddleMobile.m */; };
FC086BB520E7839B00D85EF7 /* PaddleMobile.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = FC086BB220E7839B00D85EF7 /* PaddleMobile.h */; };
FC086DC620E7841E00D85EF7 /* t_malloc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086CFE20E7841E00D85EF7 /* t_malloc.cpp */; };
FC086DC720E7841E00D85EF7 /* lrn_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0320E7841E00D85EF7 /* lrn_op.cpp */; };
FC086DC820E7841E00D85EF7 /* sigmoid_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0520E7841E00D85EF7 /* sigmoid_op.cpp */; };
FC086DC920E7841E00D85EF7 /* box_coder_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0620E7841E00D85EF7 /* box_coder_op.cpp */; };
FC086DCA20E7841E00D85EF7 /* feed_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0720E7841E00D85EF7 /* feed_op.cpp */; };
FC086DCB20E7841E00D85EF7 /* fusion_conv_add_bn_relu_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0A20E7841E00D85EF7 /* fusion_conv_add_bn_relu_op.cpp */; };
FC086DCC20E7841E00D85EF7 /* reshape_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0B20E7841E00D85EF7 /* reshape_op.cpp */; };
FC086DCD20E7841E00D85EF7 /* concat_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0C20E7841E00D85EF7 /* concat_op.cpp */; };
FC086DCE20E7841E00D85EF7 /* transpose_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0D20E7841E00D85EF7 /* transpose_op.cpp */; };
FC086DCF20E7841E00D85EF7 /* prior_box_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0E20E7841E00D85EF7 /* prior_box_op.cpp */; };
FC086DD020E7841E00D85EF7 /* fusion_conv_add_relu_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0F20E7841E00D85EF7 /* fusion_conv_add_relu_op.cpp */; };
FC086DD120E7841E00D85EF7 /* softmax_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D1520E7841E00D85EF7 /* softmax_op.cpp */; };
FC086DD220E7841E00D85EF7 /* depthwise_conv_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D1720E7841E00D85EF7 /* depthwise_conv_op.cpp */; };
FC086DD320E7841E00D85EF7 /* elementwise_add_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D1A20E7841E00D85EF7 /* elementwise_add_op.cpp */; };
FC086DD420E7841E00D85EF7 /* gemm.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D1F20E7841E00D85EF7 /* gemm.cpp */; };
FC086DD520E7841E00D85EF7 /* pool_2x2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D2220E7841E00D85EF7 /* pool_2x2.cpp */; };
FC086DD620E7841E00D85EF7 /* im2col.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D2320E7841E00D85EF7 /* im2col.cpp */; };
FC086DD720E7841E00D85EF7 /* vol2col.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D2620E7841E00D85EF7 /* vol2col.cpp */; };
FC086DD820E7841E00D85EF7 /* math_function.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D2720E7841E00D85EF7 /* math_function.cpp */; };
FC086DD920E7841E00D85EF7 /* pool_3x3.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D2820E7841E00D85EF7 /* pool_3x3.cpp */; };
FC086DDA20E7841E00D85EF7 /* pooling.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D2B20E7841E00D85EF7 /* pooling.cpp */; };
FC086DDB20E7841E00D85EF7 /* depthwise_conv_3x3.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D2D20E7841E00D85EF7 /* depthwise_conv_3x3.cpp */; };
FC086DDC20E7841E00D85EF7 /* softmax.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D2F20E7841E00D85EF7 /* softmax.cpp */; };
FC086DDD20E7841E00D85EF7 /* fetch_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D3420E7841E00D85EF7 /* fetch_op.cpp */; };
FC086DDE20E7841E00D85EF7 /* fusion_conv_add.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D3520E7841E00D85EF7 /* fusion_conv_add.cpp */; };
FC086DDF20E7841E00D85EF7 /* op_param.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D3620E7841E00D85EF7 /* op_param.cpp */; };
FC086DE020E7841E00D85EF7 /* mul_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D3A20E7841E00D85EF7 /* mul_op.cpp */; };
FC086DE120E7841E00D85EF7 /* relu_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D3B20E7841E00D85EF7 /* relu_op.cpp */; };
FC086DE220E7841E00D85EF7 /* conv_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D3C20E7841E00D85EF7 /* conv_op.cpp */; };
FC086DE320E7841E00D85EF7 /* fusion_fc_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D3D20E7841E00D85EF7 /* fusion_fc_op.cpp */; };
FC086DE420E7841E00D85EF7 /* batchnorm_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D4020E7841E00D85EF7 /* batchnorm_op.cpp */; };
FC086DE520E7841E00D85EF7 /* pool_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D4220E7841E00D85EF7 /* pool_op.cpp */; };
FC086DE620E7841E00D85EF7 /* multiclass_nms_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D4420E7841E00D85EF7 /* multiclass_nms_op.cpp */; };
FC086DE720E7841E00D85EF7 /* acl_tensor.cc in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5220E7841E00D85EF7 /* acl_tensor.cc */; };
FC086DE820E7841E00D85EF7 /* acl_operator.cc in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5320E7841E00D85EF7 /* acl_operator.cc */; };
FC086DE920E7841E00D85EF7 /* conv_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5420E7841E00D85EF7 /* conv_kernel.cpp */; };
FC086DEA20E7841E00D85EF7 /* conv_add_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5620E7841E00D85EF7 /* conv_add_kernel.cpp */; };
FC086DEB20E7841E00D85EF7 /* relu_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5820E7841E00D85EF7 /* relu_kernel.cpp */; };
FC086DEC20E7841E00D85EF7 /* mul_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5920E7841E00D85EF7 /* mul_kernel.cpp */; };
FC086DED20E7841E00D85EF7 /* elementwise_add_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5A20E7841E00D85EF7 /* elementwise_add_kernel.cpp */; };
FC086DEE20E7841E00D85EF7 /* softmax_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5C20E7841E00D85EF7 /* softmax_kernel.cpp */; };
FC086DEF20E7841E00D85EF7 /* concat_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5D20E7841E00D85EF7 /* concat_kernel.cpp */; };
FC086DF020E7841E00D85EF7 /* pool_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5E20E7841E00D85EF7 /* pool_kernel.cpp */; };
FC086DF120E7841E00D85EF7 /* reshape_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5F20E7841E00D85EF7 /* reshape_kernel.cpp */; };
FC086DF220E7841E00D85EF7 /* lrn_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D6020E7841E00D85EF7 /* lrn_kernel.cpp */; };
FC086DF320E7841E00D85EF7 /* fushion_fc_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D6120E7841E00D85EF7 /* fushion_fc_kernel.cpp */; };
FC086DF420E7841E00D85EF7 /* batchnorm_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D6220E7841E00D85EF7 /* batchnorm_kernel.cpp */; };
FC086DF520E7841E00D85EF7 /* conv_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D6F20E7841E00D85EF7 /* conv_kernel.cpp */; };
FC086DF620E7841E00D85EF7 /* prior_box_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7420E7841E00D85EF7 /* prior_box_kernel.cpp */; };
FC086DF720E7841E00D85EF7 /* conv_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7520E7841E00D85EF7 /* conv_kernel.cpp */; };
FC086DF820E7841E00D85EF7 /* conv_add_bn_relu_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7620E7841E00D85EF7 /* conv_add_bn_relu_kernel.cpp */; };
FC086DF920E7841E00D85EF7 /* box_coder_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7720E7841E00D85EF7 /* box_coder_kernel.cpp */; };
FC086DFA20E7841E00D85EF7 /* conv_add_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7820E7841E00D85EF7 /* conv_add_kernel.cpp */; };
FC086DFB20E7841E00D85EF7 /* sigmoid_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7920E7841E00D85EF7 /* sigmoid_kernel.cpp */; };
FC086DFC20E7841E00D85EF7 /* relu_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7A20E7841E00D85EF7 /* relu_kernel.cpp */; };
FC086DFD20E7841E00D85EF7 /* mul_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7B20E7841E00D85EF7 /* mul_kernel.cpp */; };
FC086DFE20E7841E00D85EF7 /* elementwise_add_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7C20E7841E00D85EF7 /* elementwise_add_kernel.cpp */; };
FC086DFF20E7841E00D85EF7 /* conv_add_relu_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7D20E7841E00D85EF7 /* conv_add_relu_kernel.cpp */; };
FC086E0020E7841E00D85EF7 /* transpose_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7E20E7841E00D85EF7 /* transpose_kernel.cpp */; };
FC086E0120E7841E00D85EF7 /* depthwise_conv_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7F20E7841E00D85EF7 /* depthwise_conv_kernel.cpp */; };
FC086E0220E7841E00D85EF7 /* softmax_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8020E7841E00D85EF7 /* softmax_kernel.cpp */; };
FC086E0320E7841E00D85EF7 /* concat_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8120E7841E00D85EF7 /* concat_kernel.cpp */; };
FC086E0420E7841E00D85EF7 /* fusion_fc_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8220E7841E00D85EF7 /* fusion_fc_kernel.cpp */; };
FC086E0520E7841E00D85EF7 /* pool_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8320E7841E00D85EF7 /* pool_kernel.cpp */; };
FC086E0620E7841E00D85EF7 /* reshape_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8420E7841E00D85EF7 /* reshape_kernel.cpp */; };
FC086E0720E7841E00D85EF7 /* lrn_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8520E7841E00D85EF7 /* lrn_kernel.cpp */; };
FC086E0820E7841E00D85EF7 /* batchnorm_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8620E7841E00D85EF7 /* batchnorm_kernel.cpp */; };
FC086E0920E7841E00D85EF7 /* multiclass_nms_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8720E7841E00D85EF7 /* multiclass_nms_kernel.cpp */; };
FC086E0A20E7841E00D85EF7 /* framework.pb-c.c in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8B20E7841E00D85EF7 /* framework.pb-c.c */; };
FC086E0B20E7841E00D85EF7 /* tensor_util.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8C20E7841E00D85EF7 /* tensor_util.cpp */; };
FC086E0C20E7841E00D85EF7 /* operator.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8F20E7841E00D85EF7 /* operator.cpp */; };
FC086E0D20E7841E00D85EF7 /* ddim.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D9020E7841E00D85EF7 /* ddim.cpp */; };
FC086E0E20E7841E00D85EF7 /* scope.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D9320E7841E00D85EF7 /* scope.cpp */; };
FC086E0F20E7841E00D85EF7 /* attribute.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D9920E7841E00D85EF7 /* attribute.cpp */; };
FC086E1020E7841E00D85EF7 /* op_desc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D9C20E7841E00D85EF7 /* op_desc.cpp */; };
FC086E1120E7841E00D85EF7 /* program_desc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D9D20E7841E00D85EF7 /* program_desc.cpp */; };
FC086E1220E7841E00D85EF7 /* node.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086DA320E7841E00D85EF7 /* node.cpp */; };
FC086E1320E7841E00D85EF7 /* program_optimize.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086DA620E7841E00D85EF7 /* program_optimize.cpp */; };
FC086E1420E7841E00D85EF7 /* block_desc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086DA720E7841E00D85EF7 /* block_desc.cpp */; };
FC086E1520E7841E00D85EF7 /* lod_tensor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086DAB20E7841E00D85EF7 /* lod_tensor.cpp */; };
FC086E1620E7841E00D85EF7 /* io.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086DB320E7841E00D85EF7 /* io.cpp */; };
FC086E1720E7841E00D85EF7 /* types.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086DB620E7841E00D85EF7 /* types.cpp */; };
FC086E1820E7841E00D85EF7 /* openmp-fix.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086DBA20E7841E00D85EF7 /* openmp-fix.cpp */; };
FC086E1920E7841E00D85EF7 /* protobuf-c.c in Sources */ = {isa = PBXBuildFile; fileRef = FC086DC120E7841E00D85EF7 /* protobuf-c.c */; };
FC086E1A20E7841E00D85EF7 /* paddle_mobile_jni.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086DC420E7841E00D85EF7 /* paddle_mobile_jni.cpp */; };
/* End PBXBuildFile section */
/* Begin PBXCopyFilesBuildPhase section */
FC086BAD20E7839B00D85EF7 /* CopyFiles */ = {
isa = PBXCopyFilesBuildPhase;
buildActionMask = 2147483647;
dstPath = "include/$(PRODUCT_NAME)";
dstSubfolderSpec = 16;
files = (
FC086BB520E7839B00D85EF7 /* PaddleMobile.h in CopyFiles */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */
FC086BAF20E7839B00D85EF7 /* libPaddleMobile.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libPaddleMobile.a; sourceTree = BUILT_PRODUCTS_DIR; };
FC086BB220E7839B00D85EF7 /* PaddleMobile.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = PaddleMobile.h; sourceTree = "<group>"; };
FC086BB320E7839B00D85EF7 /* PaddleMobile.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = PaddleMobile.m; sourceTree = "<group>"; };
FC086CFE20E7841E00D85EF7 /* t_malloc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = t_malloc.cpp; sourceTree = "<group>"; };
FC086CFF20E7841E00D85EF7 /* t_malloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = t_malloc.h; sourceTree = "<group>"; };
FC086D0120E7841E00D85EF7 /* feed_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = feed_op.h; sourceTree = "<group>"; };
FC086D0220E7841E00D85EF7 /* fusion_conv_add_bn_relu_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fusion_conv_add_bn_relu_op.h; sourceTree = "<group>"; };
FC086D0320E7841E00D85EF7 /* lrn_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = lrn_op.cpp; sourceTree = "<group>"; };
FC086D0420E7841E00D85EF7 /* op_param.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = op_param.h; sourceTree = "<group>"; };
FC086D0520E7841E00D85EF7 /* sigmoid_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sigmoid_op.cpp; sourceTree = "<group>"; };
FC086D0620E7841E00D85EF7 /* box_coder_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = box_coder_op.cpp; sourceTree = "<group>"; };
FC086D0720E7841E00D85EF7 /* feed_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = feed_op.cpp; sourceTree = "<group>"; };
FC086D0820E7841E00D85EF7 /* mul_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mul_op.h; sourceTree = "<group>"; };
FC086D0920E7841E00D85EF7 /* prior_box_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = prior_box_op.h; sourceTree = "<group>"; };
FC086D0A20E7841E00D85EF7 /* fusion_conv_add_bn_relu_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fusion_conv_add_bn_relu_op.cpp; sourceTree = "<group>"; };
FC086D0B20E7841E00D85EF7 /* reshape_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = reshape_op.cpp; sourceTree = "<group>"; };
FC086D0C20E7841E00D85EF7 /* concat_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = concat_op.cpp; sourceTree = "<group>"; };
FC086D0D20E7841E00D85EF7 /* transpose_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = transpose_op.cpp; sourceTree = "<group>"; };
FC086D0E20E7841E00D85EF7 /* prior_box_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = prior_box_op.cpp; sourceTree = "<group>"; };
FC086D0F20E7841E00D85EF7 /* fusion_conv_add_relu_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fusion_conv_add_relu_op.cpp; sourceTree = "<group>"; };
FC086D1020E7841E00D85EF7 /* lrn_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = lrn_op.h; sourceTree = "<group>"; };
FC086D1120E7841E00D85EF7 /* multiclass_nms_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = multiclass_nms_op.h; sourceTree = "<group>"; };
FC086D1220E7841E00D85EF7 /* relu_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = relu_op.h; sourceTree = "<group>"; };
FC086D1320E7841E00D85EF7 /* fusion_conv_add.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fusion_conv_add.h; sourceTree = "<group>"; };
FC086D1420E7841E00D85EF7 /* conv_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = conv_op.h; sourceTree = "<group>"; };
FC086D1520E7841E00D85EF7 /* softmax_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = softmax_op.cpp; sourceTree = "<group>"; };
FC086D1620E7841E00D85EF7 /* pool_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pool_op.h; sourceTree = "<group>"; };
FC086D1720E7841E00D85EF7 /* depthwise_conv_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = depthwise_conv_op.cpp; sourceTree = "<group>"; };
FC086D1820E7841E00D85EF7 /* softmax_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = softmax_op.h; sourceTree = "<group>"; };
FC086D1920E7841E00D85EF7 /* elementwise_add_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = elementwise_add_op.h; sourceTree = "<group>"; };
FC086D1A20E7841E00D85EF7 /* elementwise_add_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = elementwise_add_op.cpp; sourceTree = "<group>"; };
FC086D1B20E7841E00D85EF7 /* fetch_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fetch_op.h; sourceTree = "<group>"; };
FC086D1D20E7841E00D85EF7 /* elementwise_op_function.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = elementwise_op_function.h; sourceTree = "<group>"; };
FC086D1E20E7841E00D85EF7 /* softmax.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = softmax.h; sourceTree = "<group>"; };
FC086D1F20E7841E00D85EF7 /* gemm.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = gemm.cpp; sourceTree = "<group>"; };
FC086D2020E7841E00D85EF7 /* math_function.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = math_function.h; sourceTree = "<group>"; };
FC086D2120E7841E00D85EF7 /* conv_func.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = conv_func.h; sourceTree = "<group>"; };
FC086D2220E7841E00D85EF7 /* pool_2x2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pool_2x2.cpp; sourceTree = "<group>"; };
FC086D2320E7841E00D85EF7 /* im2col.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = im2col.cpp; sourceTree = "<group>"; };
FC086D2420E7841E00D85EF7 /* gemm.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = gemm.h; sourceTree = "<group>"; };
FC086D2520E7841E00D85EF7 /* im2col.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = im2col.h; sourceTree = "<group>"; };
FC086D2620E7841E00D85EF7 /* vol2col.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = vol2col.cpp; sourceTree = "<group>"; };
FC086D2720E7841E00D85EF7 /* math_function.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = math_function.cpp; sourceTree = "<group>"; };
FC086D2820E7841E00D85EF7 /* pool_3x3.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pool_3x3.cpp; sourceTree = "<group>"; };
FC086D2920E7841E00D85EF7 /* pool_2x2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pool_2x2.h; sourceTree = "<group>"; };
FC086D2A20E7841E00D85EF7 /* depthwise_conv_3x3.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = depthwise_conv_3x3.h; sourceTree = "<group>"; };
FC086D2B20E7841E00D85EF7 /* pooling.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pooling.cpp; sourceTree = "<group>"; };
FC086D2C20E7841E00D85EF7 /* pool_3x3.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pool_3x3.h; sourceTree = "<group>"; };
FC086D2D20E7841E00D85EF7 /* depthwise_conv_3x3.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = depthwise_conv_3x3.cpp; sourceTree = "<group>"; };
FC086D2E20E7841E00D85EF7 /* vol2col.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vol2col.h; sourceTree = "<group>"; };
FC086D2F20E7841E00D85EF7 /* softmax.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = softmax.cpp; sourceTree = "<group>"; };
FC086D3020E7841E00D85EF7 /* transform.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = transform.h; sourceTree = "<group>"; };
FC086D3120E7841E00D85EF7 /* pooling.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pooling.h; sourceTree = "<group>"; };
FC086D3220E7841E00D85EF7 /* math_func_neon.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = math_func_neon.h; sourceTree = "<group>"; };
FC086D3320E7841E00D85EF7 /* fusion_conv_add_relu_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fusion_conv_add_relu_op.h; sourceTree = "<group>"; };
FC086D3420E7841E00D85EF7 /* fetch_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fetch_op.cpp; sourceTree = "<group>"; };
FC086D3520E7841E00D85EF7 /* fusion_conv_add.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fusion_conv_add.cpp; sourceTree = "<group>"; };
FC086D3620E7841E00D85EF7 /* op_param.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = op_param.cpp; sourceTree = "<group>"; };
FC086D3720E7841E00D85EF7 /* transpose_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = transpose_op.h; sourceTree = "<group>"; };
FC086D3820E7841E00D85EF7 /* fusion_fc_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fusion_fc_op.h; sourceTree = "<group>"; };
FC086D3920E7841E00D85EF7 /* batchnorm_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = batchnorm_op.h; sourceTree = "<group>"; };
FC086D3A20E7841E00D85EF7 /* mul_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mul_op.cpp; sourceTree = "<group>"; };
FC086D3B20E7841E00D85EF7 /* relu_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = relu_op.cpp; sourceTree = "<group>"; };
FC086D3C20E7841E00D85EF7 /* conv_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = conv_op.cpp; sourceTree = "<group>"; };
FC086D3D20E7841E00D85EF7 /* fusion_fc_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fusion_fc_op.cpp; sourceTree = "<group>"; };
FC086D3E20E7841E00D85EF7 /* box_coder_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = box_coder_op.h; sourceTree = "<group>"; };
FC086D3F20E7841E00D85EF7 /* concat_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = concat_op.h; sourceTree = "<group>"; };
FC086D4020E7841E00D85EF7 /* batchnorm_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = batchnorm_op.cpp; sourceTree = "<group>"; };
FC086D4120E7841E00D85EF7 /* reshape_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = reshape_op.h; sourceTree = "<group>"; };
FC086D4220E7841E00D85EF7 /* pool_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pool_op.cpp; sourceTree = "<group>"; };
FC086D4320E7841E00D85EF7 /* sigmoid_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sigmoid_op.h; sourceTree = "<group>"; };
FC086D4420E7841E00D85EF7 /* multiclass_nms_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = multiclass_nms_op.cpp; sourceTree = "<group>"; };
FC086D4620E7841E00D85EF7 /* relu_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = relu_kernel.h; sourceTree = "<group>"; };
FC086D4720E7841E00D85EF7 /* multiclass_nms_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = multiclass_nms_kernel.h; sourceTree = "<group>"; };
FC086D4820E7841E00D85EF7 /* depthwise_conv_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = depthwise_conv_kernel.h; sourceTree = "<group>"; };
FC086D4920E7841E00D85EF7 /* lrn_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = lrn_kernel.h; sourceTree = "<group>"; };
FC086D4A20E7841E00D85EF7 /* pool_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pool_kernel.h; sourceTree = "<group>"; };
FC086D4B20E7841E00D85EF7 /* fusion_fc_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fusion_fc_kernel.h; sourceTree = "<group>"; };
FC086D4C20E7841E00D85EF7 /* box_coder_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = box_coder_kernel.h; sourceTree = "<group>"; };
FC086D4D20E7841E00D85EF7 /* concat_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = concat_kernel.h; sourceTree = "<group>"; };
FC086D4E20E7841E00D85EF7 /* mul_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mul_kernel.h; sourceTree = "<group>"; };
FC086D4F20E7841E00D85EF7 /* softmax_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = softmax_kernel.h; sourceTree = "<group>"; };
FC086D5020E7841E00D85EF7 /* batchnorm_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = batchnorm_kernel.h; sourceTree = "<group>"; };
FC086D5220E7841E00D85EF7 /* acl_tensor.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = acl_tensor.cc; sourceTree = "<group>"; };
FC086D5320E7841E00D85EF7 /* acl_operator.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = acl_operator.cc; sourceTree = "<group>"; };
FC086D5420E7841E00D85EF7 /* conv_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = conv_kernel.cpp; sourceTree = "<group>"; };
FC086D5520E7841E00D85EF7 /* acl_operator.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = acl_operator.h; sourceTree = "<group>"; };
FC086D5620E7841E00D85EF7 /* conv_add_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = conv_add_kernel.cpp; sourceTree = "<group>"; };
FC086D5720E7841E00D85EF7 /* acl_tensor.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = acl_tensor.h; sourceTree = "<group>"; };
FC086D5820E7841E00D85EF7 /* relu_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = relu_kernel.cpp; sourceTree = "<group>"; };
FC086D5920E7841E00D85EF7 /* mul_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mul_kernel.cpp; sourceTree = "<group>"; };
FC086D5A20E7841E00D85EF7 /* elementwise_add_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = elementwise_add_kernel.cpp; sourceTree = "<group>"; };
FC086D5C20E7841E00D85EF7 /* softmax_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = softmax_kernel.cpp; sourceTree = "<group>"; };
FC086D5D20E7841E00D85EF7 /* concat_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = concat_kernel.cpp; sourceTree = "<group>"; };
FC086D5E20E7841E00D85EF7 /* pool_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pool_kernel.cpp; sourceTree = "<group>"; };
FC086D5F20E7841E00D85EF7 /* reshape_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = reshape_kernel.cpp; sourceTree = "<group>"; };
FC086D6020E7841E00D85EF7 /* lrn_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = lrn_kernel.cpp; sourceTree = "<group>"; };
FC086D6120E7841E00D85EF7 /* fushion_fc_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fushion_fc_kernel.cpp; sourceTree = "<group>"; };
FC086D6220E7841E00D85EF7 /* batchnorm_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = batchnorm_kernel.cpp; sourceTree = "<group>"; };
FC086D6320E7841E00D85EF7 /* elementwise_add_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = elementwise_add_kernel.h; sourceTree = "<group>"; };
FC086D6520E7841E00D85EF7 /* conv_arm_func.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = conv_arm_func.h; sourceTree = "<group>"; };
FC086D6620E7841E00D85EF7 /* conv_add_bn_relu_func.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = conv_add_bn_relu_func.h; sourceTree = "<group>"; };
FC086D6720E7841E00D85EF7 /* conv_add_relu_arm_func.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = conv_add_relu_arm_func.h; sourceTree = "<group>"; };
FC086D6820E7841E00D85EF7 /* depthwise_conv_arm_func.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = depthwise_conv_arm_func.h; sourceTree = "<group>"; };
FC086D6920E7841E00D85EF7 /* batchnorm_arm_func.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = batchnorm_arm_func.h; sourceTree = "<group>"; };
FC086D6A20E7841E00D85EF7 /* conv_add_relu_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = conv_add_relu_kernel.h; sourceTree = "<group>"; };
FC086D6B20E7841E00D85EF7 /* reshape_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = reshape_kernel.h; sourceTree = "<group>"; };
FC086D6C20E7841E00D85EF7 /* transpose_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = transpose_kernel.h; sourceTree = "<group>"; };
FC086D6D20E7841E00D85EF7 /* conv_add_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = conv_add_kernel.h; sourceTree = "<group>"; };
FC086D6F20E7841E00D85EF7 /* conv_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = conv_kernel.cpp; sourceTree = "<group>"; };
FC086D7020E7841E00D85EF7 /* conv_add_bn_relu_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = conv_add_bn_relu_kernel.h; sourceTree = "<group>"; };
FC086D7120E7841E00D85EF7 /* prior_box_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = prior_box_kernel.h; sourceTree = "<group>"; };
FC086D7220E7841E00D85EF7 /* conv_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = conv_kernel.h; sourceTree = "<group>"; };
FC086D7420E7841E00D85EF7 /* prior_box_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = prior_box_kernel.cpp; sourceTree = "<group>"; };
FC086D7520E7841E00D85EF7 /* conv_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = conv_kernel.cpp; sourceTree = "<group>"; };
FC086D7620E7841E00D85EF7 /* conv_add_bn_relu_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = conv_add_bn_relu_kernel.cpp; sourceTree = "<group>"; };
FC086D7720E7841E00D85EF7 /* box_coder_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = box_coder_kernel.cpp; sourceTree = "<group>"; };
FC086D7820E7841E00D85EF7 /* conv_add_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = conv_add_kernel.cpp; sourceTree = "<group>"; };
FC086D7920E7841E00D85EF7 /* sigmoid_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sigmoid_kernel.cpp; sourceTree = "<group>"; };
FC086D7A20E7841E00D85EF7 /* relu_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = relu_kernel.cpp; sourceTree = "<group>"; };
FC086D7B20E7841E00D85EF7 /* mul_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mul_kernel.cpp; sourceTree = "<group>"; };
FC086D7C20E7841E00D85EF7 /* elementwise_add_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = elementwise_add_kernel.cpp; sourceTree = "<group>"; };
FC086D7D20E7841E00D85EF7 /* conv_add_relu_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = conv_add_relu_kernel.cpp; sourceTree = "<group>"; };
FC086D7E20E7841E00D85EF7 /* transpose_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = transpose_kernel.cpp; sourceTree = "<group>"; };
FC086D7F20E7841E00D85EF7 /* depthwise_conv_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = depthwise_conv_kernel.cpp; sourceTree = "<group>"; };
FC086D8020E7841E00D85EF7 /* softmax_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = softmax_kernel.cpp; sourceTree = "<group>"; };
FC086D8120E7841E00D85EF7 /* concat_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = concat_kernel.cpp; sourceTree = "<group>"; };
FC086D8220E7841E00D85EF7 /* fusion_fc_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fusion_fc_kernel.cpp; sourceTree = "<group>"; };
FC086D8320E7841E00D85EF7 /* pool_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pool_kernel.cpp; sourceTree = "<group>"; };
FC086D8420E7841E00D85EF7 /* reshape_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = reshape_kernel.cpp; sourceTree = "<group>"; };
FC086D8520E7841E00D85EF7 /* lrn_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = lrn_kernel.cpp; sourceTree = "<group>"; };
FC086D8620E7841E00D85EF7 /* batchnorm_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = batchnorm_kernel.cpp; sourceTree = "<group>"; };
FC086D8720E7841E00D85EF7 /* multiclass_nms_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = multiclass_nms_kernel.cpp; sourceTree = "<group>"; };
FC086D8820E7841E00D85EF7 /* sigmoid_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sigmoid_kernel.h; sourceTree = "<group>"; };
FC086D8920E7841E00D85EF7 /* depthwise_conv_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = depthwise_conv_op.h; sourceTree = "<group>"; };
FC086D8B20E7841E00D85EF7 /* framework.pb-c.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "framework.pb-c.c"; sourceTree = "<group>"; };
FC086D8C20E7841E00D85EF7 /* tensor_util.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = tensor_util.cpp; sourceTree = "<group>"; };
FC086D8D20E7841E00D85EF7 /* operator.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = operator.h; sourceTree = "<group>"; };
FC086D8E20E7841E00D85EF7 /* op_info.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = op_info.h; sourceTree = "<group>"; };
FC086D8F20E7841E00D85EF7 /* operator.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = operator.cpp; sourceTree = "<group>"; };
FC086D9020E7841E00D85EF7 /* ddim.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ddim.cpp; sourceTree = "<group>"; };
FC086D9120E7841E00D85EF7 /* tensor_util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tensor_util.h; sourceTree = "<group>"; };
FC086D9220E7841E00D85EF7 /* variable.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = variable.h; sourceTree = "<group>"; };
FC086D9320E7841E00D85EF7 /* scope.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = scope.cpp; sourceTree = "<group>"; };
FC086D9420E7841E00D85EF7 /* data_layout.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = data_layout.h; sourceTree = "<group>"; };
FC086D9520E7841E00D85EF7 /* lod_tensor.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = lod_tensor.h; sourceTree = "<group>"; };
FC086D9620E7841E00D85EF7 /* dim.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = dim.h; sourceTree = "<group>"; };
FC086D9720E7841E00D85EF7 /* framework.pb-c.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "framework.pb-c.h"; sourceTree = "<group>"; };
FC086D9820E7841E00D85EF7 /* op_kernel_type.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = op_kernel_type.h; sourceTree = "<group>"; };
FC086D9920E7841E00D85EF7 /* attribute.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = attribute.cpp; sourceTree = "<group>"; };
FC086D9A20E7841E00D85EF7 /* op_proto_maker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = op_proto_maker.h; sourceTree = "<group>"; };
FC086D9C20E7841E00D85EF7 /* op_desc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = op_desc.cpp; sourceTree = "<group>"; };
FC086D9D20E7841E00D85EF7 /* program_desc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = program_desc.cpp; sourceTree = "<group>"; };
FC086D9E20E7841E00D85EF7 /* var_desc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = var_desc.h; sourceTree = "<group>"; };
FC086D9F20E7841E00D85EF7 /* program_desc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = program_desc.h; sourceTree = "<group>"; };
FC086DA020E7841E00D85EF7 /* op_desc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = op_desc.h; sourceTree = "<group>"; };
FC086DA220E7841E00D85EF7 /* fusion_op_register.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fusion_op_register.h; sourceTree = "<group>"; };
FC086DA320E7841E00D85EF7 /* node.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = node.cpp; sourceTree = "<group>"; };
FC086DA420E7841E00D85EF7 /* node.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = node.h; sourceTree = "<group>"; };
FC086DA520E7841E00D85EF7 /* program_optimize.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = program_optimize.h; sourceTree = "<group>"; };
FC086DA620E7841E00D85EF7 /* program_optimize.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = program_optimize.cpp; sourceTree = "<group>"; };
FC086DA720E7841E00D85EF7 /* block_desc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = block_desc.cpp; sourceTree = "<group>"; };
FC086DA820E7841E00D85EF7 /* program.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = program.h; sourceTree = "<group>"; };
FC086DA920E7841E00D85EF7 /* tensor_desc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tensor_desc.h; sourceTree = "<group>"; };
FC086DAA20E7841E00D85EF7 /* block_desc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = block_desc.h; sourceTree = "<group>"; };
FC086DAB20E7841E00D85EF7 /* lod_tensor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = lod_tensor.cpp; sourceTree = "<group>"; };
FC086DAC20E7841E00D85EF7 /* framework.proto */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = framework.proto; sourceTree = "<group>"; };
FC086DAD20E7841E00D85EF7 /* ddim.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ddim.h; sourceTree = "<group>"; };
FC086DAE20E7841E00D85EF7 /* attribute.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = attribute.h; sourceTree = "<group>"; };
FC086DAF20E7841E00D85EF7 /* scope.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = scope.h; sourceTree = "<group>"; };
FC086DB020E7841E00D85EF7 /* tensor.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tensor.h; sourceTree = "<group>"; };
FC086DB120E7841E00D85EF7 /* op_registry.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = op_registry.h; sourceTree = "<group>"; };
FC086DB320E7841E00D85EF7 /* io.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = io.cpp; sourceTree = "<group>"; };
FC086DB420E7841E00D85EF7 /* io.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = io.h; sourceTree = "<group>"; };
FC086DB620E7841E00D85EF7 /* types.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = types.cpp; sourceTree = "<group>"; };
FC086DB720E7841E00D85EF7 /* threadpool.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = threadpool.h; sourceTree = "<group>"; };
FC086DB820E7841E00D85EF7 /* types.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = types.h; sourceTree = "<group>"; };
FC086DB920E7841E00D85EF7 /* protobuf-c.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "protobuf-c.h"; sourceTree = "<group>"; };
FC086DBA20E7841E00D85EF7 /* openmp-fix.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = "openmp-fix.cpp"; sourceTree = "<group>"; };
FC086DBB20E7841E00D85EF7 /* dep_core.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = dep_core.h; sourceTree = "<group>"; };
FC086DBC20E7841E00D85EF7 /* common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = common.h; sourceTree = "<group>"; };
FC086DBD20E7841E00D85EF7 /* log.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = log.h; sourceTree = "<group>"; };
FC086DBE20E7841E00D85EF7 /* macros.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = macros.h; sourceTree = "<group>"; };
FC086DBF20E7841E00D85EF7 /* type_define.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = type_define.h; sourceTree = "<group>"; };
FC086DC020E7841E00D85EF7 /* enforce.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = enforce.h; sourceTree = "<group>"; };
FC086DC120E7841E00D85EF7 /* protobuf-c.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "protobuf-c.c"; sourceTree = "<group>"; };
FC086DC220E7841E00D85EF7 /* variant.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = variant.h; sourceTree = "<group>"; };
FC086DC420E7841E00D85EF7 /* paddle_mobile_jni.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = paddle_mobile_jni.cpp; sourceTree = "<group>"; };
FC086DC520E7841E00D85EF7 /* paddle_mobile_jni.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = paddle_mobile_jni.h; sourceTree = "<group>"; };
FC2428A520E78DF20095932F /* MacroDefine.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = MacroDefine.h; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
/* Link-with-frameworks phase for the PaddleMobile static-library target.
   The files list is empty: no frameworks/libraries are linked in this phase.
   buildActionMask 2147483647 (0x7FFFFFFF) is Xcode's default "run for all
   build actions" mask. */
FC086BAC20E7839B00D85EF7 /* Frameworks */ = {
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXFrameworksBuildPhase section */
/* Begin PBXGroup section */
/* Navigator group hierarchy. The "src" group points outside the Xcode
   project directory (path = ../../../src), mirroring the repository's C++
   source tree (memory/operators/framework/io/common/jni). Children entries
   are 24-hex object IDs referencing PBXFileReference objects; several
   kernels share a base filename (e.g. conv_kernel.cpp exists under mali,
   fpga and arm) but each has a distinct file-reference ID, so the entries
   are not duplicates. */
FC086BA620E7839B00D85EF7 = {
isa = PBXGroup;
children = (
FC086BB120E7839B00D85EF7 /* PaddleMobile */,
FC086BB020E7839B00D85EF7 /* Products */,
);
sourceTree = "<group>";
};
FC086BB020E7839B00D85EF7 /* Products */ = {
isa = PBXGroup;
children = (
FC086BAF20E7839B00D85EF7 /* libPaddleMobile.a */,
);
name = Products;
sourceTree = "<group>";
};
FC086BB120E7839B00D85EF7 /* PaddleMobile */ = {
isa = PBXGroup;
children = (
FC086CFC20E7841E00D85EF7 /* src */,
FC086BB220E7839B00D85EF7 /* PaddleMobile.h */,
FC086BB320E7839B00D85EF7 /* PaddleMobile.m */,
FC2428A520E78DF20095932F /* MacroDefine.h */,
);
path = PaddleMobile;
sourceTree = "<group>";
};
/* "src" lives three directories above the project; name/path differ so the
   navigator shows "src" while files resolve relative to ../../../src. */
FC086CFC20E7841E00D85EF7 /* src */ = {
isa = PBXGroup;
children = (
FC086CFD20E7841E00D85EF7 /* memory */,
FC086D0020E7841E00D85EF7 /* operators */,
FC086D8A20E7841E00D85EF7 /* framework */,
FC086DB220E7841E00D85EF7 /* io */,
FC086DB520E7841E00D85EF7 /* common */,
FC086DC320E7841E00D85EF7 /* jni */,
);
name = src;
path = ../../../src;
sourceTree = "<group>";
};
FC086CFD20E7841E00D85EF7 /* memory */ = {
isa = PBXGroup;
children = (
FC086CFE20E7841E00D85EF7 /* t_malloc.cpp */,
FC086CFF20E7841E00D85EF7 /* t_malloc.h */,
);
path = memory;
sourceTree = "<group>";
};
FC086D0020E7841E00D85EF7 /* operators */ = {
isa = PBXGroup;
children = (
FC086D0120E7841E00D85EF7 /* feed_op.h */,
FC086D0220E7841E00D85EF7 /* fusion_conv_add_bn_relu_op.h */,
FC086D0320E7841E00D85EF7 /* lrn_op.cpp */,
FC086D0420E7841E00D85EF7 /* op_param.h */,
FC086D0520E7841E00D85EF7 /* sigmoid_op.cpp */,
FC086D0620E7841E00D85EF7 /* box_coder_op.cpp */,
FC086D0720E7841E00D85EF7 /* feed_op.cpp */,
FC086D0820E7841E00D85EF7 /* mul_op.h */,
FC086D0920E7841E00D85EF7 /* prior_box_op.h */,
FC086D0A20E7841E00D85EF7 /* fusion_conv_add_bn_relu_op.cpp */,
FC086D0B20E7841E00D85EF7 /* reshape_op.cpp */,
FC086D0C20E7841E00D85EF7 /* concat_op.cpp */,
FC086D0D20E7841E00D85EF7 /* transpose_op.cpp */,
FC086D0E20E7841E00D85EF7 /* prior_box_op.cpp */,
FC086D0F20E7841E00D85EF7 /* fusion_conv_add_relu_op.cpp */,
FC086D1020E7841E00D85EF7 /* lrn_op.h */,
FC086D1120E7841E00D85EF7 /* multiclass_nms_op.h */,
FC086D1220E7841E00D85EF7 /* relu_op.h */,
FC086D1320E7841E00D85EF7 /* fusion_conv_add.h */,
FC086D1420E7841E00D85EF7 /* conv_op.h */,
FC086D1520E7841E00D85EF7 /* softmax_op.cpp */,
FC086D1620E7841E00D85EF7 /* pool_op.h */,
FC086D1720E7841E00D85EF7 /* depthwise_conv_op.cpp */,
FC086D1820E7841E00D85EF7 /* softmax_op.h */,
FC086D1920E7841E00D85EF7 /* elementwise_add_op.h */,
FC086D1A20E7841E00D85EF7 /* elementwise_add_op.cpp */,
FC086D1B20E7841E00D85EF7 /* fetch_op.h */,
FC086D1C20E7841E00D85EF7 /* math */,
FC086D3320E7841E00D85EF7 /* fusion_conv_add_relu_op.h */,
FC086D3420E7841E00D85EF7 /* fetch_op.cpp */,
FC086D3520E7841E00D85EF7 /* fusion_conv_add.cpp */,
FC086D3620E7841E00D85EF7 /* op_param.cpp */,
FC086D3720E7841E00D85EF7 /* transpose_op.h */,
FC086D3820E7841E00D85EF7 /* fusion_fc_op.h */,
FC086D3920E7841E00D85EF7 /* batchnorm_op.h */,
FC086D3A20E7841E00D85EF7 /* mul_op.cpp */,
FC086D3B20E7841E00D85EF7 /* relu_op.cpp */,
FC086D3C20E7841E00D85EF7 /* conv_op.cpp */,
FC086D3D20E7841E00D85EF7 /* fusion_fc_op.cpp */,
FC086D3E20E7841E00D85EF7 /* box_coder_op.h */,
FC086D3F20E7841E00D85EF7 /* concat_op.h */,
FC086D4020E7841E00D85EF7 /* batchnorm_op.cpp */,
FC086D4120E7841E00D85EF7 /* reshape_op.h */,
FC086D4220E7841E00D85EF7 /* pool_op.cpp */,
FC086D4320E7841E00D85EF7 /* sigmoid_op.h */,
FC086D4420E7841E00D85EF7 /* multiclass_nms_op.cpp */,
FC086D4520E7841E00D85EF7 /* kernel */,
FC086D8920E7841E00D85EF7 /* depthwise_conv_op.h */,
);
path = operators;
sourceTree = "<group>";
};
FC086D1C20E7841E00D85EF7 /* math */ = {
isa = PBXGroup;
children = (
FC086D1D20E7841E00D85EF7 /* elementwise_op_function.h */,
FC086D1E20E7841E00D85EF7 /* softmax.h */,
FC086D1F20E7841E00D85EF7 /* gemm.cpp */,
FC086D2020E7841E00D85EF7 /* math_function.h */,
FC086D2120E7841E00D85EF7 /* conv_func.h */,
FC086D2220E7841E00D85EF7 /* pool_2x2.cpp */,
FC086D2320E7841E00D85EF7 /* im2col.cpp */,
FC086D2420E7841E00D85EF7 /* gemm.h */,
FC086D2520E7841E00D85EF7 /* im2col.h */,
FC086D2620E7841E00D85EF7 /* vol2col.cpp */,
FC086D2720E7841E00D85EF7 /* math_function.cpp */,
FC086D2820E7841E00D85EF7 /* pool_3x3.cpp */,
FC086D2920E7841E00D85EF7 /* pool_2x2.h */,
FC086D2A20E7841E00D85EF7 /* depthwise_conv_3x3.h */,
FC086D2B20E7841E00D85EF7 /* pooling.cpp */,
FC086D2C20E7841E00D85EF7 /* pool_3x3.h */,
FC086D2D20E7841E00D85EF7 /* depthwise_conv_3x3.cpp */,
FC086D2E20E7841E00D85EF7 /* vol2col.h */,
FC086D2F20E7841E00D85EF7 /* softmax.cpp */,
FC086D3020E7841E00D85EF7 /* transform.h */,
FC086D3120E7841E00D85EF7 /* pooling.h */,
FC086D3220E7841E00D85EF7 /* math_func_neon.h */,
);
path = math;
sourceTree = "<group>";
};
/* Kernel headers plus per-backend implementation subgroups
   (mali / central-arm-func / fpga / arm). */
FC086D4520E7841E00D85EF7 /* kernel */ = {
isa = PBXGroup;
children = (
FC086D4620E7841E00D85EF7 /* relu_kernel.h */,
FC086D4720E7841E00D85EF7 /* multiclass_nms_kernel.h */,
FC086D4820E7841E00D85EF7 /* depthwise_conv_kernel.h */,
FC086D4920E7841E00D85EF7 /* lrn_kernel.h */,
FC086D4A20E7841E00D85EF7 /* pool_kernel.h */,
FC086D4B20E7841E00D85EF7 /* fusion_fc_kernel.h */,
FC086D4C20E7841E00D85EF7 /* box_coder_kernel.h */,
FC086D4D20E7841E00D85EF7 /* concat_kernel.h */,
FC086D4E20E7841E00D85EF7 /* mul_kernel.h */,
FC086D4F20E7841E00D85EF7 /* softmax_kernel.h */,
FC086D5020E7841E00D85EF7 /* batchnorm_kernel.h */,
FC086D5120E7841E00D85EF7 /* mali */,
FC086D6320E7841E00D85EF7 /* elementwise_add_kernel.h */,
FC086D6420E7841E00D85EF7 /* central-arm-func */,
FC086D6A20E7841E00D85EF7 /* conv_add_relu_kernel.h */,
FC086D6B20E7841E00D85EF7 /* reshape_kernel.h */,
FC086D6C20E7841E00D85EF7 /* transpose_kernel.h */,
FC086D6D20E7841E00D85EF7 /* conv_add_kernel.h */,
FC086D6E20E7841E00D85EF7 /* fpga */,
FC086D7020E7841E00D85EF7 /* conv_add_bn_relu_kernel.h */,
FC086D7120E7841E00D85EF7 /* prior_box_kernel.h */,
FC086D7220E7841E00D85EF7 /* conv_kernel.h */,
FC086D7320E7841E00D85EF7 /* arm */,
FC086D8820E7841E00D85EF7 /* sigmoid_kernel.h */,
);
path = kernel;
sourceTree = "<group>";
};
FC086D5120E7841E00D85EF7 /* mali */ = {
isa = PBXGroup;
children = (
FC086D5220E7841E00D85EF7 /* acl_tensor.cc */,
FC086D5320E7841E00D85EF7 /* acl_operator.cc */,
FC086D5420E7841E00D85EF7 /* conv_kernel.cpp */,
FC086D5520E7841E00D85EF7 /* acl_operator.h */,
FC086D5620E7841E00D85EF7 /* conv_add_kernel.cpp */,
FC086D5720E7841E00D85EF7 /* acl_tensor.h */,
FC086D5820E7841E00D85EF7 /* relu_kernel.cpp */,
FC086D5920E7841E00D85EF7 /* mul_kernel.cpp */,
FC086D5A20E7841E00D85EF7 /* elementwise_add_kernel.cpp */,
FC086D5B20E7841E00D85EF7 /* ACL_Android */,
FC086D5C20E7841E00D85EF7 /* softmax_kernel.cpp */,
FC086D5D20E7841E00D85EF7 /* concat_kernel.cpp */,
FC086D5E20E7841E00D85EF7 /* pool_kernel.cpp */,
FC086D5F20E7841E00D85EF7 /* reshape_kernel.cpp */,
FC086D6020E7841E00D85EF7 /* lrn_kernel.cpp */,
/* NOTE(review): "fushion" spelling appears to match the referenced file's
   actual on-disk name (see its PBXFileReference); do not "correct" it here
   without renaming the file itself — confirm against the source tree. */
FC086D6120E7841E00D85EF7 /* fushion_fc_kernel.cpp */,
FC086D6220E7841E00D85EF7 /* batchnorm_kernel.cpp */,
);
path = mali;
sourceTree = "<group>";
};
/* Empty placeholder group for the ACL_Android (ARM Compute Library)
   submodule checkout; no files are referenced from it in this project. */
FC086D5B20E7841E00D85EF7 /* ACL_Android */ = {
isa = PBXGroup;
children = (
);
path = ACL_Android;
sourceTree = "<group>";
};
FC086D6420E7841E00D85EF7 /* central-arm-func */ = {
isa = PBXGroup;
children = (
FC086D6520E7841E00D85EF7 /* conv_arm_func.h */,
FC086D6620E7841E00D85EF7 /* conv_add_bn_relu_func.h */,
FC086D6720E7841E00D85EF7 /* conv_add_relu_arm_func.h */,
FC086D6820E7841E00D85EF7 /* depthwise_conv_arm_func.h */,
FC086D6920E7841E00D85EF7 /* batchnorm_arm_func.h */,
);
path = "central-arm-func";
sourceTree = "<group>";
};
FC086D6E20E7841E00D85EF7 /* fpga */ = {
isa = PBXGroup;
children = (
FC086D6F20E7841E00D85EF7 /* conv_kernel.cpp */,
);
path = fpga;
sourceTree = "<group>";
};
FC086D7320E7841E00D85EF7 /* arm */ = {
isa = PBXGroup;
children = (
FC086D7420E7841E00D85EF7 /* prior_box_kernel.cpp */,
FC086D7520E7841E00D85EF7 /* conv_kernel.cpp */,
FC086D7620E7841E00D85EF7 /* conv_add_bn_relu_kernel.cpp */,
FC086D7720E7841E00D85EF7 /* box_coder_kernel.cpp */,
FC086D7820E7841E00D85EF7 /* conv_add_kernel.cpp */,
FC086D7920E7841E00D85EF7 /* sigmoid_kernel.cpp */,
FC086D7A20E7841E00D85EF7 /* relu_kernel.cpp */,
FC086D7B20E7841E00D85EF7 /* mul_kernel.cpp */,
FC086D7C20E7841E00D85EF7 /* elementwise_add_kernel.cpp */,
FC086D7D20E7841E00D85EF7 /* conv_add_relu_kernel.cpp */,
FC086D7E20E7841E00D85EF7 /* transpose_kernel.cpp */,
FC086D7F20E7841E00D85EF7 /* depthwise_conv_kernel.cpp */,
FC086D8020E7841E00D85EF7 /* softmax_kernel.cpp */,
FC086D8120E7841E00D85EF7 /* concat_kernel.cpp */,
FC086D8220E7841E00D85EF7 /* fusion_fc_kernel.cpp */,
FC086D8320E7841E00D85EF7 /* pool_kernel.cpp */,
FC086D8420E7841E00D85EF7 /* reshape_kernel.cpp */,
FC086D8520E7841E00D85EF7 /* lrn_kernel.cpp */,
FC086D8620E7841E00D85EF7 /* batchnorm_kernel.cpp */,
FC086D8720E7841E00D85EF7 /* multiclass_nms_kernel.cpp */,
);
path = arm;
sourceTree = "<group>";
};
FC086D8A20E7841E00D85EF7 /* framework */ = {
isa = PBXGroup;
children = (
FC086D8B20E7841E00D85EF7 /* framework.pb-c.c */,
FC086D8C20E7841E00D85EF7 /* tensor_util.cpp */,
FC086D8D20E7841E00D85EF7 /* operator.h */,
FC086D8E20E7841E00D85EF7 /* op_info.h */,
FC086D8F20E7841E00D85EF7 /* operator.cpp */,
FC086D9020E7841E00D85EF7 /* ddim.cpp */,
FC086D9120E7841E00D85EF7 /* tensor_util.h */,
FC086D9220E7841E00D85EF7 /* variable.h */,
FC086D9320E7841E00D85EF7 /* scope.cpp */,
FC086D9420E7841E00D85EF7 /* data_layout.h */,
FC086D9520E7841E00D85EF7 /* lod_tensor.h */,
FC086D9620E7841E00D85EF7 /* dim.h */,
FC086D9720E7841E00D85EF7 /* framework.pb-c.h */,
FC086D9820E7841E00D85EF7 /* op_kernel_type.h */,
FC086D9920E7841E00D85EF7 /* attribute.cpp */,
FC086D9A20E7841E00D85EF7 /* op_proto_maker.h */,
FC086D9B20E7841E00D85EF7 /* program */,
FC086DAB20E7841E00D85EF7 /* lod_tensor.cpp */,
FC086DAC20E7841E00D85EF7 /* framework.proto */,
FC086DAD20E7841E00D85EF7 /* ddim.h */,
FC086DAE20E7841E00D85EF7 /* attribute.h */,
FC086DAF20E7841E00D85EF7 /* scope.h */,
FC086DB020E7841E00D85EF7 /* tensor.h */,
FC086DB120E7841E00D85EF7 /* op_registry.h */,
);
path = framework;
sourceTree = "<group>";
};
FC086D9B20E7841E00D85EF7 /* program */ = {
isa = PBXGroup;
children = (
FC086D9C20E7841E00D85EF7 /* op_desc.cpp */,
FC086D9D20E7841E00D85EF7 /* program_desc.cpp */,
FC086D9E20E7841E00D85EF7 /* var_desc.h */,
FC086D9F20E7841E00D85EF7 /* program_desc.h */,
FC086DA020E7841E00D85EF7 /* op_desc.h */,
FC086DA120E7841E00D85EF7 /* program-optimize */,
FC086DA720E7841E00D85EF7 /* block_desc.cpp */,
FC086DA820E7841E00D85EF7 /* program.h */,
FC086DA920E7841E00D85EF7 /* tensor_desc.h */,
FC086DAA20E7841E00D85EF7 /* block_desc.h */,
);
path = program;
sourceTree = "<group>";
};
FC086DA120E7841E00D85EF7 /* program-optimize */ = {
isa = PBXGroup;
children = (
FC086DA220E7841E00D85EF7 /* fusion_op_register.h */,
FC086DA320E7841E00D85EF7 /* node.cpp */,
FC086DA420E7841E00D85EF7 /* node.h */,
FC086DA520E7841E00D85EF7 /* program_optimize.h */,
FC086DA620E7841E00D85EF7 /* program_optimize.cpp */,
);
path = "program-optimize";
sourceTree = "<group>";
};
FC086DB220E7841E00D85EF7 /* io */ = {
isa = PBXGroup;
children = (
FC086DB320E7841E00D85EF7 /* io.cpp */,
FC086DB420E7841E00D85EF7 /* io.h */,
);
path = io;
sourceTree = "<group>";
};
FC086DB520E7841E00D85EF7 /* common */ = {
isa = PBXGroup;
children = (
FC086DB620E7841E00D85EF7 /* types.cpp */,
FC086DB720E7841E00D85EF7 /* threadpool.h */,
FC086DB820E7841E00D85EF7 /* types.h */,
FC086DB920E7841E00D85EF7 /* protobuf-c.h */,
FC086DBA20E7841E00D85EF7 /* openmp-fix.cpp */,
FC086DBB20E7841E00D85EF7 /* dep_core.h */,
FC086DBC20E7841E00D85EF7 /* common.h */,
FC086DBD20E7841E00D85EF7 /* log.h */,
FC086DBE20E7841E00D85EF7 /* macros.h */,
FC086DBF20E7841E00D85EF7 /* type_define.h */,
FC086DC020E7841E00D85EF7 /* enforce.h */,
FC086DC120E7841E00D85EF7 /* protobuf-c.c */,
FC086DC220E7841E00D85EF7 /* variant.h */,
);
path = common;
sourceTree = "<group>";
};
FC086DC320E7841E00D85EF7 /* jni */ = {
isa = PBXGroup;
children = (
FC086DC420E7841E00D85EF7 /* paddle_mobile_jni.cpp */,
FC086DC520E7841E00D85EF7 /* paddle_mobile_jni.h */,
);
path = jni;
sourceTree = "<group>";
};
/* End PBXGroup section */
/* Begin PBXNativeTarget section */
/* The project's single buildable target: a static library
   (com.apple.product-type.library.static) producing libPaddleMobile.a.
   Build runs three phases in order: Sources, Frameworks, CopyFiles. */
FC086BAE20E7839B00D85EF7 /* PaddleMobile */ = {
isa = PBXNativeTarget;
buildConfigurationList = FC086BB820E7839B00D85EF7 /* Build configuration list for PBXNativeTarget "PaddleMobile" */;
buildPhases = (
FC086BAB20E7839B00D85EF7 /* Sources */,
FC086BAC20E7839B00D85EF7 /* Frameworks */,
FC086BAD20E7839B00D85EF7 /* CopyFiles */,
);
buildRules = (
);
dependencies = (
);
name = PaddleMobile;
productName = PaddleMobile;
productReference = FC086BAF20E7839B00D85EF7 /* libPaddleMobile.a */;
productType = "com.apple.product-type.library.static";
};
/* End PBXNativeTarget section */
/* Begin PBXProject section */
/* Root project object (created with Xcode 9.3.1, compatibility "Xcode 9.3").
   English is the only known region; the sole target is the PaddleMobile
   static library defined above. */
FC086BA720E7839B00D85EF7 /* Project object */ = {
isa = PBXProject;
attributes = {
LastUpgradeCheck = 0930;
ORGANIZATIONNAME = orange;
TargetAttributes = {
FC086BAE20E7839B00D85EF7 = {
CreatedOnToolsVersion = 9.3.1;
};
};
};
buildConfigurationList = FC086BAA20E7839B00D85EF7 /* Build configuration list for PBXProject "PaddleMobile" */;
compatibilityVersion = "Xcode 9.3";
developmentRegion = en;
hasScannedForEncodings = 0;
knownRegions = (
en,
);
mainGroup = FC086BA620E7839B00D85EF7;
productRefGroup = FC086BB020E7839B00D85EF7 /* Products */;
projectDirPath = "";
projectRoot = "";
targets = (
FC086BAE20E7839B00D85EF7 /* PaddleMobile */,
);
};
/* End PBXProject section */
/* Begin PBXSourcesBuildPhase section */
FC086BAB20E7839B00D85EF7 /* Sources */ = {
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
FC086DCE20E7841E00D85EF7 /* transpose_op.cpp in Sources */,
FC086DD820E7841E00D85EF7 /* math_function.cpp in Sources */,
FC086DE120E7841E00D85EF7 /* relu_op.cpp in Sources */,
FC086E0920E7841E00D85EF7 /* multiclass_nms_kernel.cpp in Sources */,
FC086E0220E7841E00D85EF7 /* softmax_kernel.cpp in Sources */,
FC086DCD20E7841E00D85EF7 /* concat_op.cpp in Sources */,
FC086DCA20E7841E00D85EF7 /* feed_op.cpp in Sources */,
FC086DD920E7841E00D85EF7 /* pool_3x3.cpp in Sources */,
FC086DF020E7841E00D85EF7 /* pool_kernel.cpp in Sources */,
FC086E1A20E7841E00D85EF7 /* paddle_mobile_jni.cpp in Sources */,
FC086DF620E7841E00D85EF7 /* prior_box_kernel.cpp in Sources */,
FC086DC620E7841E00D85EF7 /* t_malloc.cpp in Sources */,
FC086DD320E7841E00D85EF7 /* elementwise_add_op.cpp in Sources */,
FC086E0E20E7841E00D85EF7 /* scope.cpp in Sources */,
FC086DDE20E7841E00D85EF7 /* fusion_conv_add.cpp in Sources */,
FC086DFF20E7841E00D85EF7 /* conv_add_relu_kernel.cpp in Sources */,
FC086DD720E7841E00D85EF7 /* vol2col.cpp in Sources */,
FC086E0B20E7841E00D85EF7 /* tensor_util.cpp in Sources */,
FC086E1320E7841E00D85EF7 /* program_optimize.cpp in Sources */,
FC086DF820E7841E00D85EF7 /* conv_add_bn_relu_kernel.cpp in Sources */,
FC086DC820E7841E00D85EF7 /* sigmoid_op.cpp in Sources */,
FC086E0D20E7841E00D85EF7 /* ddim.cpp in Sources */,
FC086E0120E7841E00D85EF7 /* depthwise_conv_kernel.cpp in Sources */,
FC086DDB20E7841E00D85EF7 /* depthwise_conv_3x3.cpp in Sources */,
FC086BB420E7839B00D85EF7 /* PaddleMobile.m in Sources */,
FC086E1420E7841E00D85EF7 /* block_desc.cpp in Sources */,
FC086DC920E7841E00D85EF7 /* box_coder_op.cpp in Sources */,
FC086DDF20E7841E00D85EF7 /* op_param.cpp in Sources */,
FC086DD520E7841E00D85EF7 /* pool_2x2.cpp in Sources */,
FC086DFD20E7841E00D85EF7 /* mul_kernel.cpp in Sources */,
FC086E0C20E7841E00D85EF7 /* operator.cpp in Sources */,
FC086DE020E7841E00D85EF7 /* mul_op.cpp in Sources */,
FC086E1520E7841E00D85EF7 /* lod_tensor.cpp in Sources */,
FC086DE720E7841E00D85EF7 /* acl_tensor.cc in Sources */,
FC086DDD20E7841E00D85EF7 /* fetch_op.cpp in Sources */,
FC086DE220E7841E00D85EF7 /* conv_op.cpp in Sources */,
FC086DDA20E7841E00D85EF7 /* pooling.cpp in Sources */,
FC086DEF20E7841E00D85EF7 /* concat_kernel.cpp in Sources */,
FC086DE520E7841E00D85EF7 /* pool_op.cpp in Sources */,
FC086DE820E7841E00D85EF7 /* acl_operator.cc in Sources */,
FC086DF220E7841E00D85EF7 /* lrn_kernel.cpp in Sources */,
FC086E0F20E7841E00D85EF7 /* attribute.cpp in Sources */,
FC086E0520E7841E00D85EF7 /* pool_kernel.cpp in Sources */,
FC086DDC20E7841E00D85EF7 /* softmax.cpp in Sources */,
FC086E0420E7841E00D85EF7 /* fusion_fc_kernel.cpp in Sources */,
FC086E1220E7841E00D85EF7 /* node.cpp in Sources */,
FC086E0820E7841E00D85EF7 /* batchnorm_kernel.cpp in Sources */,
FC086DCC20E7841E00D85EF7 /* reshape_op.cpp in Sources */,
FC086DE920E7841E00D85EF7 /* conv_kernel.cpp in Sources */,
FC086E1920E7841E00D85EF7 /* protobuf-c.c in Sources */,
FC086DF920E7841E00D85EF7 /* box_coder_kernel.cpp in Sources */,
FC086DF120E7841E00D85EF7 /* reshape_kernel.cpp in Sources */,
FC086DF720E7841E00D85EF7 /* conv_kernel.cpp in Sources */,
FC086DCF20E7841E00D85EF7 /* prior_box_op.cpp in Sources */,
FC086E1720E7841E00D85EF7 /* types.cpp in Sources */,
FC086DF320E7841E00D85EF7 /* fushion_fc_kernel.cpp in Sources */,
FC086DEB20E7841E00D85EF7 /* relu_kernel.cpp in Sources */,
FC086E0620E7841E00D85EF7 /* reshape_kernel.cpp in Sources */,
FC086E0720E7841E00D85EF7 /* lrn_kernel.cpp in Sources */,
FC086DE620E7841E00D85EF7 /* multiclass_nms_op.cpp in Sources */,
FC086E1120E7841E00D85EF7 /* program_desc.cpp in Sources */,
FC086E0320E7841E00D85EF7 /* concat_kernel.cpp in Sources */,
FC086DEC20E7841E00D85EF7 /* mul_kernel.cpp in Sources */,
FC086DFB20E7841E00D85EF7 /* sigmoid_kernel.cpp in Sources */,
FC086E1820E7841E00D85EF7 /* openmp-fix.cpp in Sources */,
FC086DF420E7841E00D85EF7 /* batchnorm_kernel.cpp in Sources */,
FC086DEA20E7841E00D85EF7 /* conv_add_kernel.cpp in Sources */,
FC086E1620E7841E00D85EF7 /* io.cpp in Sources */,
FC086DD620E7841E00D85EF7 /* im2col.cpp in Sources */,
FC086DC720E7841E00D85EF7 /* lrn_op.cpp in Sources */,
FC086DD220E7841E00D85EF7 /* depthwise_conv_op.cpp in Sources */,
FC086DFA20E7841E00D85EF7 /* conv_add_kernel.cpp in Sources */,
FC086E0A20E7841E00D85EF7 /* framework.pb-c.c in Sources */,
FC086DD020E7841E00D85EF7 /* fusion_conv_add_relu_op.cpp in Sources */,
FC086DCB20E7841E00D85EF7 /* fusion_conv_add_bn_relu_op.cpp in Sources */,
FC086DFC20E7841E00D85EF7 /* relu_kernel.cpp in Sources */,
FC086DE320E7841E00D85EF7 /* fusion_fc_op.cpp in Sources */,
FC086E0020E7841E00D85EF7 /* transpose_kernel.cpp in Sources */,
FC086DEE20E7841E00D85EF7 /* softmax_kernel.cpp in Sources */,
FC086DE420E7841E00D85EF7 /* batchnorm_op.cpp in Sources */,
FC086DED20E7841E00D85EF7 /* elementwise_add_kernel.cpp in Sources */,
FC086DF520E7841E00D85EF7 /* conv_kernel.cpp in Sources */,
FC086DD120E7841E00D85EF7 /* softmax_op.cpp in Sources */,
FC086E1020E7841E00D85EF7 /* op_desc.cpp in Sources */,
FC086DD420E7841E00D85EF7 /* gemm.cpp in Sources */,
FC086DFE20E7841E00D85EF7 /* elementwise_add_kernel.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXSourcesBuildPhase section */
/* Begin XCBuildConfiguration section */
FC086BB620E7839B00D85EF7 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
CLANG_CXX_LIBRARY = "libc++";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
CODE_SIGN_IDENTITY = "iPhone Developer";
COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = dwarf;
ENABLE_STRICT_OBJC_MSGSEND = YES;
ENABLE_TESTABILITY = YES;
GCC_C_LANGUAGE_STANDARD = gnu11;
GCC_DYNAMIC_NO_PIC = NO;
GCC_NO_COMMON_BLOCKS = YES;
GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
"DEBUG=1",
"$(inherited)",
);
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
IPHONEOS_DEPLOYMENT_TARGET = 11.3;
MTL_ENABLE_DEBUG_INFO = YES;
ONLY_ACTIVE_ARCH = YES;
SDKROOT = iphoneos;
};
name = Debug;
};
FC086BB720E7839B00D85EF7 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
CLANG_CXX_LIBRARY = "libc++";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
CODE_SIGN_IDENTITY = "iPhone Developer";
COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
ENABLE_NS_ASSERTIONS = NO;
ENABLE_STRICT_OBJC_MSGSEND = YES;
GCC_C_LANGUAGE_STANDARD = gnu11;
GCC_NO_COMMON_BLOCKS = YES;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
IPHONEOS_DEPLOYMENT_TARGET = 11.3;
MTL_ENABLE_DEBUG_INFO = NO;
SDKROOT = iphoneos;
VALIDATE_PRODUCT = YES;
};
name = Release;
};
FC086BB920E7839B00D85EF7 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
CODE_SIGN_STYLE = Automatic;
DEVELOPMENT_TEAM = Z5M2UUN5YV;
HEADER_SEARCH_PATHS = ../../src;
OTHER_LDFLAGS = "-ObjC";
PRODUCT_NAME = "$(TARGET_NAME)";
SKIP_INSTALL = YES;
TARGETED_DEVICE_FAMILY = "1,2";
};
name = Debug;
};
FC086BBA20E7839B00D85EF7 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
CODE_SIGN_STYLE = Automatic;
DEVELOPMENT_TEAM = Z5M2UUN5YV;
HEADER_SEARCH_PATHS = ../../src;
OTHER_LDFLAGS = "-ObjC";
PRODUCT_NAME = "$(TARGET_NAME)";
SKIP_INSTALL = YES;
TARGETED_DEVICE_FAMILY = "1,2";
};
name = Release;
};
/* End XCBuildConfiguration section */
/* Begin XCConfigurationList section */
FC086BAA20E7839B00D85EF7 /* Build configuration list for PBXProject "PaddleMobile" */ = {
isa = XCConfigurationList;
buildConfigurations = (
FC086BB620E7839B00D85EF7 /* Debug */,
FC086BB720E7839B00D85EF7 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
FC086BB820E7839B00D85EF7 /* Build configuration list for PBXNativeTarget "PaddleMobile" */ = {
isa = XCConfigurationList;
buildConfigurations = (
FC086BB920E7839B00D85EF7 /* Debug */,
FC086BBA20E7839B00D85EF7 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
/* End XCConfigurationList section */
};
rootObject = FC086BA720E7839B00D85EF7 /* Project object */;
}
<?xml version="1.0" encoding="UTF-8"?>
<Workspace
version = "1.0">
<FileRef
location = "self:PaddleMobile.xcodeproj">
</FileRef>
</Workspace>
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>IDEDidComputeMac32BitWarning</key>
<true/>
</dict>
</plist>
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>SchemeUserState</key>
<dict>
<key>PaddleMobile.xcscheme</key>
<dict>
<key>orderHint</key>
<integer>1</integer>
</dict>
</dict>
</dict>
</plist>
//
// MacroDefine.h
// PaddleMobile
//
// Created by liuRuiLong on 2018/6/30.
// Copyright © 2018年 orange. All rights reserved.
//
// Intentionally empty placeholder header: a central place to add shared
// macro definitions for the iOS wrapper. Currently defines nothing.
#ifndef MacroDefine_h
#define MacroDefine_h
#endif /* MacroDefine_h */
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>IDEDidComputeMac32BitWarning</key>
<true/>
</dict>
</plist>
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#import <UIKit/UIKit.h>
// Standard UIApplication delegate for the demo app; no custom behavior is
// declared here beyond holding the app's main window.
@interface AppDelegate : UIResponder <UIApplicationDelegate>
// The app's key window, created by the UIKit application lifecycle.
@property (strong, nonatomic) UIWindow *window;
@end
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#import <UIKit/UIKit.h>
// Root view controller of the demo app; declares no public API of its own.
@interface ViewController : UIViewController
@end
......@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstdlib>
#include "common/enforce.h"
#include "common/log.h"
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <cstdlib>
#include <string>
#include <typeinfo>
#include <unordered_map>
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <cctype>
#include <cstdlib>
#include <string>
namespace paddle_mobile {
......
......@@ -14,9 +14,11 @@ limitations under the License. */
#pragma once
#include <cstdlib>
#include <initializer_list>
#include <typeinfo>
#include <vector>
#include "common/enforce.h"
#include "common/variant.h"
#include "dim.h"
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <cstdlib>
#include "common/enforce.h"
namespace paddle_mobile {
namespace framework {
......
......@@ -34,6 +34,10 @@ class FusionOpRegister {
}
void regist(FusionOpMatcher* matcher) {
if (matchers_.find(matcher->Type()) != matchers_.end()) {
return;
}
std::shared_ptr<FusionOpMatcher> shared_matcher(matcher);
matchers_[matcher->Type()] = shared_matcher;
}
......
......@@ -152,7 +152,7 @@ class Tensor {
if (holder_ != nullptr) {
holder_->set_type(type);
}
PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor'snumel must >=0.")
PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.")
int64_t size = numel() * SizeOfType(type);
if (holder_ == nullptr || holder_->size() < size + offset_) {
holder_.reset(new PlaceholderImpl(size, type));
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "io/executor.h"
#include <operators/math/gemm.h>
#include <algorithm>
#include <vector>
#include "common/enforce.h"
......@@ -25,6 +26,9 @@ limitations under the License. */
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#ifdef _OPENMP
#include <omp.h>
#endif // _OPENMP
#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <queue>
#include <utility>
......@@ -348,16 +352,19 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
fprintf(df, "}\n");
fclose(df);
#endif
FILE *pf = fopen("profile.out", "w");
// FILE *pf = fopen("profile.out", "w");
std::unordered_map<std::string, uint64_t> _tp;
for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
_tp[ops[i]->Type()] += timeCost;
fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i, ops[i]->Type().c_str(),
pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost);
// fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i,
// ops[i]->Type().c_str(),
// pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost);
}
fclose(pf);
// fclose(pf);
printf("====================[ profile ]======================\n");
using prof_t = std::pair<std::string, uint64_t>;
std::vector<prof_t> _tv(_tp.begin(), _tp.end());
......@@ -400,6 +407,14 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
return result_vector;
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::SetThreadNum(int num) {
#ifdef _OPENMP
// omp_set_dynamic(0);
omp_set_num_threads(num);
#endif
}
template class Executor<CPU, Precision::FP32>;
template class Executor<FPGA, Precision::FP32>;
template class Executor<GPU_MALI, Precision::FP32>;
......
......@@ -58,6 +58,8 @@ class Executor {
std::vector<Ptype> Predict(const std::vector<Ptype> &input,
const std::vector<int64_t> &dims);
void SetThreadNum(int num);
protected:
Executor() = default;
void InitMemory();
......
......@@ -21,7 +21,6 @@ namespace paddle_mobile {
using framework::Variable;
static size_t ReadBuffer(const char *file_name, uint8_t **out) {
printf("%s \n", file_name);
FILE *fp;
fp = fopen(file_name, "rb");
PADDLE_MOBILE_ENFORCE(fp != NULL, " %s open failed !", file_name);
......
......@@ -12,10 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
//
// Created by liuRuiLong on 2018/7/2.
//
#include "io/paddle_mobile.h"
namespace paddle_mobile {
......@@ -78,6 +74,12 @@ void PaddleMobile<Dtype, P>::Clear() {
loader_ = nullptr;
}
template <typename Dtype, Precision P>
PaddleMobile<Dtype, P>::~PaddleMobile() {
executor_ = nullptr;
loader_ = nullptr;
}
template class PaddleMobile<CPU, Precision::FP32>;
template class PaddleMobile<FPGA, Precision::FP32>;
template class PaddleMobile<GPU_MALI, Precision::FP32>;
......
......@@ -60,6 +60,8 @@ class PaddleMobile {
void Clear();
~PaddleMobile();
private:
std::shared_ptr<Loader<Dtype, P>> loader_;
std::shared_ptr<Executor<Dtype, P>> executor_;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#import "PaddleMobile.h"
// NOTE(review): this span appears to interleave the public header
// (method declarations) with a stub implementation whose methods all
// exit(0) — likely an extraction/concatenation artifact of two files;
// confirm against the original repository layout.
#pragma once
#import <CoreImage/CoreImage.h>
#import <Foundation/Foundation.h>
@interface PaddleMobile : NSObject
/*
  Create the object.
*/
- (instancetype)init;
@implementation PaddleMobile
/*
  Load the model and allocate memory.
*/
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
+ (instancetype)sharedInstance{
    //TODO: imp
    exit(0);
}
/*
  Load a model stored as separate files; pass the model's directory.
*/
- (BOOL)load:(NSString *)modelAndWeightPath;
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath{
    //TODO: imp
    exit(0);
}
/*
  Run prediction. `means` and `scale` are the preprocessing parameters used
  when the model was trained; if no such preprocessing was done at training
  time, call `predict` directly instead.
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
- (NSArray *)predict:(CGImageRef)image means:(NSArray<NSNumber *> *)means scale:(float)scale{
    //TODO: imp
    exit(0);
}
/*
  Run prediction.
*/
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
- (NSArray *)predict:(CGImageRef)image{
    //TODO: imp
    exit(0);
}
/*
  Release memory.
*/
- (void)clear;
- (void)clear{
    //TODO: imp
    exit(0);
}
@end
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#import "PaddleMobile.h"
#import "op_symbols.h"
#import "io/paddle_mobile.h"
#import <memory>
#import <vector>
// Class extension holding the C++ engine instance and load state.
@interface PaddleMobile()
{
    // Owned raw pointer to the C++ inference engine (FP32, CPU backend);
    // created in -init, destroyed in -dealloc.
    paddle_mobile::PaddleMobile<paddle_mobile::CPU, paddle_mobile::Precision::FP32> *pam_;
    // YES once a model has been loaded successfully.
    BOOL loaded_;
}
@end
@implementation PaddleMobile
// Serializes calls into the engine across all PaddleMobile instances.
static std::mutex shared_mutex;
// Designated initializer: allocates the underlying C++ inference engine.
- (instancetype)init {
    self = [super init];
    if (self != nil) {
        // FP32 precision on the CPU backend; freed in -dealloc.
        pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU, paddle_mobile::Precision::FP32>();
    }
    return self;
}
// Releases the C++ engine created in -init (ObjC ivars are handled by ARC).
- (void)dealloc {
    // `delete` on a null pointer is a no-op in C++, so no guard is needed;
    // behavior is identical to the previous `if (pam_) delete pam_;`.
    delete pam_;
}
// Thread-safe singleton accessor backed by dispatch_once.
+ (instancetype)sharedInstance{
    static PaddleMobile *instance = nil;
    static dispatch_once_t token;
    dispatch_once(&token, ^{
        instance = [[[self class] alloc] init];
    });
    return instance;
}
// Loads a model whose topology and weights live in separate files.
// Records the outcome in loaded_ so predict can refuse to run beforehand.
- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath{
    std::string model_path([modelPath UTF8String]);
    std::string weights_path([weighsPath UTF8String]);
    // Third argument (optimize) is false, matching the original behavior.
    loaded_ = pam_->Load(model_path, weights_path, false) ? YES : NO;
    return loaded_;
}
// Loads a combined model (topology + weights in one path).
- (BOOL)load:(NSString *)modelAndWeightPath{
    std::string combined_path([modelAndWeightPath UTF8String]);
    loaded_ = pam_->Load(combined_path) ? YES : NO;
    return loaded_;
}
// Resamples (nearest-neighbour) and normalises an interleaved image buffer
// into a planar (CHW) float buffer: out = (pixel - mean) * scale.
//   input          - interleaved source pixels, imageChannels per pixel
//   output         - destination buffer of at least dim[1]*dim[2]*dim[3] floats
//   means/scale    - per-channel mean subtraction and global scale
//   dim            - NCHW target shape: dim[1]=channels, dim[2]=H, dim[3]=W
-(void)preprocess:(const UInt8 *)input output:(float *)output imageWidth:(int)imageWidth imageHeight:(int)imageHeight imageChannels:(int)imageChannels means:(NSArray<NSNumber *> *)means scale:(float)scale dim:(std::vector<int64_t>)dim{
    // Default to zero means when the caller passes nil.
    if (means == nil) {
        means = @[@0, @0, @0];
    }
    int wanted_input_width = dim[3];
    int wanted_input_height = dim[2];
    int wanted_input_channels = dim[1];
    for (int c = 0; c < wanted_input_channels; ++c) {
        float *out_channel = output + c * wanted_input_height * wanted_input_width;
        for (int y = 0; y < wanted_input_height; ++y) {
            float *out_row = out_channel + y * wanted_input_width;
            for (int x = 0; x < wanted_input_width; ++x) {
                // Nearest-neighbour sample position in the source image.
                int in_row = (y * imageHeight) / wanted_input_height;
                int in_col = (x * imageWidth) / wanted_input_width;
                const UInt8 *in_pixel = input + (in_row * imageWidth * imageChannels) + (in_col * imageChannels);
                float *out_pos = out_row + x;
                // Fix: the original had three byte-identical branches for
                // c == 0 / 1 / 2 — collapsed into one guarded statement
                // (behavior unchanged).
                // NOTE(review): channels >= 3 are left unwritten, exactly as
                // before — confirm callers always pass dim[1] <= 3 or
                // pre-zero the output buffer.
                if (c < 3) {
                    *out_pos = (in_pixel[c] - means[c].floatValue) * scale;
                }
            }
        }
    }
}
// Runs inference on a CGImage. `dim` is the NCHW input shape (4 entries);
// `means`/`scale` are the training-time preprocessing parameters.
// Returns an NSArray of NSNumber floats, or nil on failure.
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale{
    // The engine is shared; serialise all predictions.
    std::lock_guard<std::mutex> lock(shared_mutex);
    if (!loaded_) {
        // Fix: corrected ungrammatical log message.
        printf("PaddleMobile has not been loaded yet");
        return nil;
    }
    if (dim.count != 4) {
        printf("dim must have 4 elements");
        return nil;
    }
    // Convert dim to a C++ vector and compute the total element count.
    std::vector<int64_t> dim_vec;
    int numel = 1;
    for (int k = 0; k < dim.count; ++k) {
        int d = dim[k].intValue;
        numel *= d;
        dim_vec.push_back(d);
    }
    const int image_width = (int)CGImageGetWidth(image);
    const int image_height = (int)CGImageGetHeight(image);
    // Pixel data is assumed to be 4 bytes per pixel (RGBA) — TODO confirm
    // against the actual CGImage format of callers.
    const int image_channels = 4;
    CGDataProviderRef provider = CGImageGetDataProvider(image);
    CFDataRef cfData = CGDataProviderCopyData(provider);
    const UInt8 *input = CFDataGetBytePtr(cfData);
    // Resize/normalise into a freshly allocated float buffer.
    float *output = (float *)malloc(numel * sizeof(float));
    if (output == nullptr) {
        // Fix: the original leaked cfData on this path and called
        // preprocess with a null output buffer before checking it.
        CFRelease(cfData);
        return nil;
    }
    [self preprocess:input output:output imageWidth:image_width imageHeight:image_height imageChannels:image_channels means:means scale:scale dim:dim_vec];
    // Copy the preprocessed buffer into the engine's input vector
    // (range construction replaces the original per-element push_back loop).
    std::vector<float> predict_input(output, output + numel);
    // Run inference.
    std::vector<float> cpp_result = pam_->Predict(predict_input, dim_vec);
    // Box the float results into NSNumbers for the caller.
    NSMutableArray *result = [[NSMutableArray alloc] init];
    for (size_t i = 0; i < cpp_result.size(); ++i) {
        [result addObject:[NSNumber numberWithFloat:cpp_result[i]]];
    }
    free(output);
    CFRelease(cfData);
    cfData = NULL;
    return result;
}
// Convenience overload: predict with no mean subtraction and unit scale.
- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim {
    // Fix: the original computed the result but never returned it —
    // undefined behavior for a method with a non-void return type.
    return [self predict:image dim:dim means:nil scale:1];
}
// Releases the model/executor memory held by the C++ engine; the wrapper
// object itself stays usable for a subsequent load.
- (void)clear{
    pam_->Clear();
}
@end
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#import "ViewController.h"
// Private class extension; no private members declared.
@interface ViewController ()
@end
@implementation ViewController
// Default lifecycle overrides — no custom behavior beyond calling super.
- (void)viewDidLoad {
    [super viewDidLoad];
}
- (void)didReceiveMemoryWarning {
    [super didReceiveMemoryWarning];
}
@end
#pragma once
// Aggregates every operator header in one place. Presumably this forces the
// operators' registration symbols to be linked into the static library so
// they are available at runtime — TODO confirm against the registration
// macros in the operator headers.
#include "operators/batchnorm_op.h"
#include "operators/box_coder_op.h"
#include "operators/concat_op.h"
#include "operators/conv_op.h"
#include "operators/depthwise_conv_op.h"
#include "operators/dropout_op.h"
#include "operators/elementwise_add_op.h"
#include "operators/feed_op.h"
#include "operators/fetch_op.h"
#include "operators/fusion_conv_add.h"
#include "operators/fusion_conv_add_bn_relu_op.h"
#include "operators/fusion_fc_op.h"
#include "operators/im2sequence_op.h"
#include "operators/lrn_op.h"
#include "operators/mul_op.h"
#include "operators/multiclass_nms_op.h"
#include "operators/pool_op.h"
#include "operators/prior_box_op.h"
#include "operators/relu_op.h"
#include "operators/reshape_op.h"
#include "operators/sigmoid_op.h"
#include "operators/softmax_op.h"
#include "operators/transpose_op.h"
......@@ -60,8 +60,26 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
optimize);
}
JNIEXPORT jfloatArray JNICALL
Java_com_baidu_paddle_PML_predict(JNIEnv *env, jclass thiz, jfloatArray buf) {
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) {
ANDROIDLOGI("loadCombined invoked");
bool optimize = true;
return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
jstring2cppstring(env, paramPath),
optimize);
}
JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
JNIEnv *env, jclass thiz, jfloatArray buf, jintArray ddims) {
ANDROIDLOGI("predictImage invoked");
jsize ddim_size = env->GetArrayLength(ddims);
if (ddim_size != 4) {
ANDROIDLOGE("ddims size not equal to 4");
}
jint *ddim_ptr = env->GetIntArrayElements(ddims, NULL);
framework::DDim ddim = framework::make_ddim(
{ddim_ptr[0], ddim_ptr[1], ddim_ptr[2], ddim_ptr[3]});
int length = framework::product(ddim);
jfloatArray result = NULL;
int count = 0;
float *dataPointer = nullptr;
......@@ -69,17 +87,102 @@ Java_com_baidu_paddle_PML_predict(JNIEnv *env, jclass thiz, jfloatArray buf) {
dataPointer = env->GetFloatArrayElements(buf, NULL);
}
framework::Tensor input;
framework::DDim ddim = framework::make_ddim({1, 3, 224, 224});
input.Resize(ddim);
auto input_ptr = input.mutable_data<float>();
for (int i = 0; i < framework::product(ddim); i++) {
for (int i = 0; i < length; i++) {
input_ptr[i] = dataPointer[i];
}
auto output = shared_paddle_mobile_instance->Predict(input);
count = output->numel();
result = env->NewFloatArray(count);
env->SetFloatArrayRegion(result, 0, count, output->data<float>());
ANDROIDLOGI("predict finished");
env->ReleaseIntArrayElements(ddims, ddim_ptr, 0);
ANDROIDLOGI("predictImage finished");
return result;
}
// Converts one YUV sample (NV21 layout, 8-bit components) to RGB using the
// same fixed coefficients as before. Results are clamped to [0, 255] and
// written through r/g/b as floats. The return value is always 0 and is kept
// only for interface compatibility.
inline int yuv_to_rgb(int y, int u, int v, float *r, float *g, float *b) {
  const int du = u - 128;
  const int dv = v - 128;
  int red = (int)(y + 1.370705 * dv);
  int green = (int)(y - 0.698001 * du - 0.703125 * dv);
  int blue = (int)(y + 1.732446 * du);
  // Integer clamp — equivalent to the original fminf/fmaxf pair, since the
  // intermediate values are small integers represented exactly as floats.
  red = red < 0 ? 0 : (red > 255 ? 255 : red);
  green = green < 0 ? 0 : (green > 255 ? 255 : green);
  blue = blue < 0 ? 0 : (blue > 255 ? 255 : blue);
  *r = (float)red;
  *g = (float)green;
  *b = (float)blue;
  return 0;
}
// Converts an NV21 (YUV420SP) frame to a planar RGB float matrix of size
// targetWidth x targetHeight, subtracting per-channel means. Output layout
// is CHW: all R values, then all G, then all B. Scaling from the source
// frame uses nearest-neighbour sampling.
//   nv21   - width*height Y bytes followed by interleaved VU bytes
//   matrix - destination, 3 * targetWidth * targetHeight floats
//   means  - 3 floats subtracted per channel; NOTE(review): dereferenced
//            unconditionally — callers must not pass null.
void convert_nv21_to_matrix(uint8_t *nv21, float *matrix, int width, int height,
int targetWidth, int targetHeight, float *means) {
  // NV21: Y plane first, then the interleaved V/U plane.
  const uint8_t *yData = nv21;
  const uint8_t *vuData = nv21 + width * height;
  const int yRowStride = width;
  // VU rows cover two Y rows but still span `width` bytes each.
  const int vuRowStride = width;
  float scale_x = width * 1.0 / targetWidth;
  float scale_y = height * 1.0 / targetHeight;
  for (int j = 0; j < targetHeight; ++j) {
    // Nearest source row for this target row.
    int y = j * scale_y;
    const uint8_t *pY = yData + y * yRowStride;
    // Chroma is subsampled 2x vertically (y >> 1 selects the VU row).
    const uint8_t *pVU = vuData + (y >> 1) * vuRowStride;
    for (int i = 0; i < targetWidth; ++i) {
      int x = i * scale_x;
      // Chroma is subsampled 2x horizontally; offset points at the V byte
      // of the 2-pixel-wide VU pair.
      const int offset = ((x >> 1) << 1);
      float r = 0;
      float g = 0;
      float b = 0;
      // NV21 stores V first, then U: pVU[offset] = V, pVU[offset + 1] = U.
      yuv_to_rgb(pY[x], pVU[offset + 1], pVU[offset], &r, &g, &b);
      // Planar CHW destination indices for this pixel.
      int r_index = j * targetWidth + i;
      int g_index = r_index + targetWidth * targetHeight;
      int b_index = g_index + targetWidth * targetHeight;
      matrix[r_index] = r - means[0];
      matrix[g_index] = g - means[1];
      matrix[b_index] = b - means[2];
    }
  }
}
// JNI entry point: runs inference on an NV21 camera frame.
//   yuv_       - NV21 bytes, imgwidth x imgHeight
//   ddims      - 4-element NCHW input shape
//   meanValues - 3 per-channel means subtracted during conversion
// Returns the flattened output tensor as a jfloatArray.
JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictYuv(
JNIEnv *env, jclass thiz, jbyteArray yuv_, jint imgwidth, jint imgHeight,
jintArray ddims, jfloatArray meanValues) {
  ANDROIDLOGI("predictYuv invoked");
  jsize ddim_size = env->GetArrayLength(ddims);
  if (ddim_size != 4) {
    // NOTE(review): logs the error but continues; the make_ddim call below
    // still reads 4 elements — confirm callers always pass 4.
    ANDROIDLOGE("ddims size not equal to 4");
  }
  jint *ddim_ptr = env->GetIntArrayElements(ddims, NULL);
  framework::DDim ddim = framework::make_ddim(
  {ddim_ptr[0], ddim_ptr[1], ddim_ptr[2], ddim_ptr[3]});
  int length = framework::product(ddim);
  // NOTE(review): variable-length array is a non-standard C++ extension and
  // can overflow the stack for large input shapes — consider std::vector.
  float matrix[length];
  jbyte *yuv = env->GetByteArrayElements(yuv_, NULL);
  float *meansPointer = nullptr;
  if (nullptr != meanValues) {
    meansPointer = env->GetFloatArrayElements(meanValues, NULL);
  }
  // NOTE(review): if meanValues is null, meansPointer stays null and is
  // dereferenced inside convert_nv21_to_matrix — verify callers.
  convert_nv21_to_matrix((uint8_t *)yuv, matrix, imgwidth, imgHeight, ddim[3],
  ddim[2], meansPointer);
  jfloatArray result = NULL;
  int count = 0;
  // Copy the converted frame into the input tensor.
  framework::Tensor input;
  input.Resize(ddim);
  auto input_ptr = input.mutable_data<float>();
  for (int i = 0; i < length; i++) {
    input_ptr[i] = matrix[i];
  }
  auto output = shared_paddle_mobile_instance->Predict(input);
  count = output->numel();
  result = env->NewFloatArray(count);
  env->SetFloatArrayRegion(result, 0, count, output->data<float>());
  // Release the pinned/copied Java arrays.
  env->ReleaseByteArrayElements(yuv_, yuv, 0);
  env->ReleaseIntArrayElements(ddims, ddim_ptr, 0);
  // NOTE(review): called even when meanValues is null — confirm the JNI
  // implementation tolerates a null array here.
  env->ReleaseFloatArrayElements(meanValues, meansPointer, 0);
  ANDROIDLOGI("predictYuv finished");
  return result;
}
......
......@@ -22,11 +22,29 @@ extern "C" {
namespace paddle_mobile {
namespace jni {
/**
* load model & params of the net for android
* load separated model for android
*/
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
jclass thiz,
jstring modelPath);
/**
* load combined model for android
*/
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath);
/**
* object detection for anroid
*/
JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
JNIEnv *env, jclass thiz, jfloatArray buf, jintArray ddims);
/**
* object detection for anroid
*/
JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictYuv(
JNIEnv *env, jclass thiz, jbyteArray yuv, jint imgwidth, jint imgHeight,
jintArray ddims, jfloatArray meanValues);
/**
* object detection for anroid
......
......@@ -32,11 +32,9 @@ template class BatchNormOp<CPU, float>;
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(batch_norm);
REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(batch_norm);
REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
......
......@@ -45,4 +45,13 @@ class BatchNormOp
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(batch_norm);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(batch_norm);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -53,7 +53,6 @@ template class BoxCoderOp<CPU, float>;
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(box_coder);
REGISTER_OPERATOR_CPU(box_coder, ops::BoxCoderOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
......
......@@ -51,4 +51,12 @@ class BoxCoderOp
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(box_coder);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -63,11 +63,9 @@ template class ConcatOp<CPU, float>;
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(concat);
REGISTER_OPERATOR_CPU(concat, ops::ConcatOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(concat);
REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
......
......@@ -46,4 +46,13 @@ class ConcatOp
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(concat);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(concat);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -55,15 +55,12 @@ template class ConvOp<CPU, float>;
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(conv2d);
REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(conv2d);
REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(conv2d);
REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp);
#endif
......
......@@ -46,4 +46,14 @@ class ConvOp
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(conv2d);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(conv2d);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(conv2d);
#endif
#endif
......@@ -56,7 +56,6 @@ template class DepthwiseConvOp<CPU, float>;
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(depthwise_conv2d);
REGISTER_OPERATOR_CPU(depthwise_conv2d, ops::DepthwiseConvOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
......
......@@ -48,4 +48,12 @@ class DepthwiseConvOp : public framework::OperatorWithKernel<
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(depthwise_conv2d);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -28,7 +28,6 @@ template class DropoutOp<CPU, float>;
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(dropout);
REGISTER_OPERATOR_CPU(dropout, ops::DropoutOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
......
......@@ -50,4 +50,12 @@ class DropoutOp
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(dropout);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -30,11 +30,9 @@ template class ElementwiseAddOp<CPU, float>;
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(elementwise_add);
REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(elementwise_add);
REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
......
......@@ -48,4 +48,13 @@ class ElementwiseAddOp : public framework::OperatorWithKernel<
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(elementwise_add);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(elementwise_add);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -19,3 +19,14 @@ namespace operators {
template class FeedOp<CPU, float>;
}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(feed, ops::FeedOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
......@@ -44,17 +44,14 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
FeedParam param_;
};
namespace ops = paddle_mobile::operators;
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(feed);
REGISTER_OPERATOR_CPU(feed, ops::FeedOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(feed);
REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators
} // namespace paddle_mobile
......@@ -19,3 +19,13 @@ namespace operators {
template class FetchOp<CPU, float>;
}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fetch, ops::FetchOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
......@@ -44,17 +44,14 @@ class FetchOp : public framework::OperatorBase<DeviceType> {
FetchParam param_;
};
namespace ops = paddle_mobile::operators;
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fetch);
REGISTER_OPERATOR_CPU(fetch, ops::FetchOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fetch);
REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators
} // namespace paddle_mobile
......@@ -44,18 +44,17 @@ void FusionConvAddOp<Dtype, T>::InferShape() const {
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
template class FusionConvAddOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add);
REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(conv_add);
REGISTER_OPERATOR_MALI_GPU(conv_add, ops::FusionConvAddOp);
REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
......
......@@ -82,6 +82,7 @@ static framework::FusionOpRegistrar convadd_registrar(
static framework::FusionOpRegistrar convadd_registrar(
new FusionConvAddMatcher());
#define CONV_ADD_REGISTER
#endif
#endif
......@@ -92,4 +93,13 @@ static framework::FusionOpRegistrar convadd_registrar(
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fusion_conv_add);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -50,7 +50,6 @@ template class FusionConvAddBNReluOp<CPU, float>;
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_bn_relu);
REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
......
......@@ -79,11 +79,11 @@ class FusionConvAddBNReluOp
#ifdef PADDLE_MOBILE_CPU
//#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER
// static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar(
// new FusionConvAddBNReluMatcher());
//#define FUSION_CONV_ADD_BN_RELU_REGISTER
//#endif
#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar(
new FusionConvAddBNReluMatcher());
#define FUSION_CONV_ADD_BN_RELU_REGISTER
#endif
#endif
......@@ -103,4 +103,12 @@ static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar(
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_bn_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -49,7 +49,6 @@ void FusionConvAddReluOp<Dtype, T>::InferShape() const {
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_relu);
REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
......
......@@ -80,4 +80,12 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel<
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -49,18 +49,17 @@ void FusionFcOp<Dtype, T>::InferShape() const {
framework::DDim ddim = framework::make_ddim(output_dims);
this->param_.Out()->Resize(ddim);
}
template class FusionFcOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_fc);
REGISTER_OPERATOR_CPU(fusion_fc, ops::FusionFcOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fc);
REGISTER_OPERATOR_MALI_GPU(fc, ops::FusionFcOp);
REGISTER_OPERATOR_MALI_GPU(fusion_fc, ops::FusionFcOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
......
......@@ -89,4 +89,13 @@ static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_fc);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fusion_fc);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -54,7 +54,6 @@ template class Im2SequenceOp<CPU, float>;
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(im2sequence);
REGISTER_OPERATOR_CPU(im2sequence, ops::Im2SequenceOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
......
......@@ -50,4 +50,12 @@ class Im2SequenceOp : public framework::OperatorWithKernel<
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(im2sequence);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PRELU_OP
#include "operators/kernel/prelu_kernel.h"
#include <operators/math/transform.h>
namespace paddle_mobile {
namespace operators {
// Element-wise PReLU activation: identity for positive inputs, scales
// negative inputs by a fixed slope.
template <typename T>
struct PReluFunctor {
  explicit PReluFunctor(float slope) : slope_(slope) {}

  // Apply PReLU to a single value.
  inline T operator()(T in) const {
    if (in > 0) {
      return in;
    }
    return in * slope_;
  }

  float slope_ = 0.0f;
};
/*
 * @b Platform-specific (CPU) implementation; param is passed in from the
 *    op layer.
 *
 * PReLU: out = in when in > 0, otherwise out = in * slope.  A single
 * slope is shared across the whole tensor; multiple slopes are applied
 * per channel (per element for rank-1 input).
 * */
template <>
void PReluKernel<CPU, float>::Compute(const PReluParam &param) const {
  const auto *input_x = param.InputX();
  auto *input_x_ptr = input_x->data<float>();
  auto *out = param.Out();
  auto *out_ptr = out->mutable_data<float>();

  if (param.Slopes().size() == 1) {
    // Shared slope: transform the whole buffer with one functor.
    PReluFunctor<float> func_(param.Slopes()[0]);
    math::Transform trans;
    trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_);
  } else if (param.Slopes().size() > 1) {
    const int dim_size = input_x->dims().size();
    switch (dim_size) {
      case 0:
        break;
      case 1: {
        // Rank-1 input: one slope per element.
        // Fix: the previous code multiplied every element by its slope,
        // scaling positive values as well; PReLU must leave positive
        // inputs untouched.
        const int input_width = input_x->dims()[0];
#pragma omp parallel for
        for (int w = 0; w < input_width; ++w) {
          PReluFunctor<float> func_(param.Slopes()[w]);
          out_ptr[w] = func_(input_x_ptr[w]);
        }
      } break;
      case 2: {
        // Rank-2 (H x W) input: one slope per row.
        const int input_height = input_x->dims()[0];
        const int input_width = input_x->dims()[1];
        math::Transform trans;
#pragma omp parallel for
        for (int h = 0; h < input_height; ++h) {
          PReluFunctor<float> func_(param.Slopes()[h]);
          const float *ptr = input_x_ptr + h * input_width;
          float *optr = out_ptr + h * input_width;
          trans(ptr, ptr + input_width, optr, func_);
        }
      } break;
      case 3: {
        // Rank-3 (C x H x W) input: one slope per channel.
        const int chan_size = input_x->dims()[0];
        const int input_height = input_x->dims()[1];
        const int input_width = input_x->dims()[2];
        math::Transform trans;
#pragma omp parallel for
        for (int c = 0; c < chan_size; ++c) {
          PReluFunctor<float> func_(param.Slopes()[c]);
          int size = input_height * input_width;
          const float *ptr = input_x_ptr + c * size;
          float *optr = out_ptr + c * size;
          trans(ptr, ptr + size, optr, func_);
        }
      } break;
      case 4:
      default: {
        // Rank-4 (N x C x H x W) input: one slope per channel, applied
        // to every batch.
        const int batch_size = input_x->dims()[0];
        const int chan_size = input_x->dims()[1];
        const int input_height = input_x->dims()[2];
        const int input_width = input_x->dims()[3];
        math::Transform trans;
#pragma omp parallel for
        for (int b = 0; b < batch_size; ++b) {
          for (int c = 0; c < chan_size; ++c) {
            PReluFunctor<float> func_(param.Slopes()[c]);
            int size = input_height * input_width;
            // Fix: the NCHW plane offset is (b * C + c) * H * W; the
            // previous b * c * size addressed channel 0 of every batch
            // at the same location.
            const float *ptr = input_x_ptr + (b * chan_size + c) * size;
            float *optr = out_ptr + (b * chan_size + c) * size;
            trans(ptr, ptr + size, optr, func_);
          }
        }
      }  // case 4,default
      break;
    }
  }
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESIZE_OP
#include "operators/kernel/resize_kernel.h"
#include <cmath>
namespace paddle_mobile {
namespace operators {
// Bilinear interpolation of a single-channel float plane.
// Each destination pixel is mapped back into the source grid and blended
// from up to four surrounding source samples; neighbours that would fall
// outside the source extent are simply dropped from the blend (no
// renormalisation at the borders).
void BiLinearResizeTensor(const float* src, const int src_height,
                          const int src_width, float* dst, const int dst_height,
                          const int dst_width) {
  const float scale_w = static_cast<float>(src_width) / dst_width;
  const float scale_h = static_cast<float>(src_height) / dst_height;

  float* out_ptr = dst;
  for (int dy = 0; dy < dst_height; ++dy) {
    const float src_y = dy * scale_h;
    const int y0 = static_cast<int>(std::floor(src_y));
    const float y_frac = src_y - y0;
    const float wy0 = std::abs(1.0f - y_frac);
    const float wy1 = std::abs(y_frac);
    const float* src_row = src + y0 * src_width;

    for (int dx = 0; dx < dst_width; ++dx) {
      const float src_x = dx * scale_w;
      const int x0 = static_cast<int>(std::floor(src_x));
      const float x_frac = src_x - x0;
      const float wx0 = std::abs(1.0f - x_frac);
      const float wx1 = std::abs(x_frac);

      // Top-left sample always exists; count how many of the right /
      // bottom neighbours are in range before touching the diagonal.
      float value = wy0 * wx0 * src_row[x0];
      int neighbours = 0;
      if (x0 + 1 < src_width) {
        value += wy0 * wx1 * src_row[x0 + 1];
        ++neighbours;
      }
      if (y0 + 1 < src_height) {
        value += wy1 * wx0 * src_row[x0 + src_width];
        ++neighbours;
      }
      if (neighbours > 1) {
        value += wy1 * wx1 * src_row[x0 + src_width + 1];
      }
      *out_ptr++ = value;
    }
  }
}
void ResizeTensor(const Tensor* src, const int src_n, const int src_c,
Tensor* dst, const int dst_n, const int dst_c) {
framework::DDim in_dims = src->dims();
const int src_chans = in_dims[1];
const int src_height = in_dims[2];
const int src_width = in_dims[3];
const int src_offset = (src_n * src_chans + src_c) * src_height * src_width;
framework::DDim out_dims = dst->dims();
const int dst_chans = out_dims[1];
const int dst_height = out_dims[2];
const int dst_width = out_dims[3];
const int dst_offset = (dst_n * dst_chans + dst_c) * dst_height * dst_width;
const auto* src_ptr = src->data<float>();
auto* dst_ptr = dst->data<float>();
const auto* src_data = &(src_ptr[src_offset]);
auto* dst_data = &(dst_ptr[dst_offset]);
BiLinearResizeTensor(src_data, src_height, src_width, dst_data, dst_height,
dst_width);
}
// Resize every (batch, channel) plane of src into dst.  Batch and
// channel counts must already agree between the two tensors; only the
// spatial dimensions may differ.
void ResizeTensor(const Tensor* src, Tensor* dst) {
  framework::DDim in_dims = src->dims();
  framework::DDim out_dims = dst->dims();
  PADDLE_MOBILE_ENFORCE(in_dims[0] == out_dims[0],
                        "src tensor batch num not equal to dst tensor");
  PADDLE_MOBILE_ENFORCE(in_dims[1] == out_dims[1],
                        "src tensor channel num not equal to dst tensor");
  const int batch_num = in_dims[0];
  const int chan_num = in_dims[1];
  for (int n = 0; n < batch_num; ++n) {
    for (int c = 0; c < chan_num; ++c) {
      ResizeTensor(src, n, c, dst, n, c);
    }
  }
}
// CPU resize kernel: compute the output shape from the op parameters,
// resize the output tensor accordingly, then bilinearly resample the
// input into it.
template <>
void ResizeKernel<CPU, float>::Compute(const ResizeParam& param) const {
  const auto* input_x = param.InputX();
  auto* out = param.Out();
  // Fix: removed an unused local copy of input_x->dims().
  framework::DDim out_dims = CalOutputShape(param);
  out->Resize(out_dims);
  ResizeTensor(input_x, out);
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SCALE_OP
#include "operators/kernel/scale_kernel.h"
namespace paddle_mobile {
namespace operators {
/*
 * @b Platform-specific (CPU) implementation; param is passed in from the
 *    op layer.
 *
 * Scale: out = in * scale (+ bias when HasBias()).  For rank-1/2 input
 * the scale/bias are indexed per innermost element; for rank-3/4 input
 * they are indexed per channel.
 * */
template <>
void ScaleKernel<CPU, float>::Compute(const ScaleParam &param) const {
  const auto *input_x = param.InputX();
  auto *input_x_ptr = input_x->data<float>();
  auto *out = param.Out();
  auto *out_ptr = out->mutable_data<float>();
  const vector<float> scales = param.Scales();
  bool has_bias = param.HasBias();

  const int dim_size = input_x->dims().size();
  switch (dim_size) {
    case 1: {
      // Rank-1: per-element scale (and bias).
      const int input_width = input_x->dims()[0];
      if (has_bias) {
        const vector<float> biases = param.Biases();
#pragma omp parallel for
        for (int w = 0; w < input_width; w++) {
          out_ptr[w] = input_x_ptr[w] * scales[w] + biases[w];
        }
      } else {
#pragma omp parallel for
        for (int w = 0; w < input_width; w++) {
          out_ptr[w] = input_x_ptr[w] * scales[w];
        }
      }
    } break;
    case 2: {
      // Rank-2 (H x W): scale/bias indexed by column, applied per row.
      const int input_height = input_x->dims()[0];
      const int input_width = input_x->dims()[1];
      if (has_bias) {
        const vector<float> biases = param.Biases();
#pragma omp parallel for
        for (int h = 0; h < input_height; ++h) {
          const float *iptr = input_x_ptr + h * input_width;
          float *optr = out_ptr + h * input_width;
          for (int w = 0; w < input_width; ++w) {
            optr[w] = iptr[w] * scales[w] + biases[w];
          }
        }
      } else {
#pragma omp parallel for
        for (int h = 0; h < input_height; ++h) {
          const float *iptr = input_x_ptr + h * input_width;
          float *optr = out_ptr + h * input_width;
          for (int w = 0; w < input_width; ++w) {
            optr[w] = iptr[w] * scales[w];
          }
        }
      }
    } break;
    case 3: {
      // Rank-3 (C x H x W): per-channel scale/bias.
      const int chan_size = input_x->dims()[0];
      const int input_height = input_x->dims()[1];
      const int input_width = input_x->dims()[2];
      int size = input_width * input_height;
      if (has_bias) {
        const vector<float> biases = param.Biases();
#pragma omp parallel for
        for (int c = 0; c < chan_size; ++c) {
          const float *iptr = input_x_ptr + c * size;
          float *optr = out_ptr + c * size;
          for (int i = 0; i < size; ++i) {
            optr[i] = iptr[i] * scales[c] + biases[c];
          }
        }
      } else {
#pragma omp parallel for
        for (int c = 0; c < chan_size; ++c) {
          const float *iptr = input_x_ptr + c * size;
          float *optr = out_ptr + c * size;
          for (int i = 0; i < size; ++i) {
            optr[i] = iptr[i] * scales[c];
          }
        }
      }
    } break;
    case 4: {
      // Rank-4 (N x C x H x W): per-channel scale/bias for every batch.
      const int batch_size = input_x->dims()[0];
      // Fix: NCHW dims are [N, C, H, W]; the previous code read the
      // channel count from dims()[0] and height/width from dims()[1]/[2].
      const int chan_size = input_x->dims()[1];
      const int input_height = input_x->dims()[2];
      const int input_width = input_x->dims()[3];
      int size = input_width * input_height;
      if (has_bias) {
        const vector<float> biases = param.Biases();
#pragma omp parallel for
        for (int b = 0; b < batch_size; ++b) {
          for (int c = 0; c < chan_size; ++c) {
            // Fix: plane offset is (b * C + c) * H * W, not b * c * H * W.
            const float *iptr = input_x_ptr + (b * chan_size + c) * size;
            float *optr = out_ptr + (b * chan_size + c) * size;
            for (int i = 0; i < size; ++i) {
              optr[i] = iptr[i] * scales[c] + biases[c];
            }
          }
        }
      } else {
#pragma omp parallel for
        for (int b = 0; b < batch_size; ++b) {
          for (int c = 0; c < chan_size; ++c) {
            const float *iptr = input_x_ptr + (b * chan_size + c) * size;
            float *optr = out_ptr + (b * chan_size + c) * size;
            for (int i = 0; i < size; ++i) {
              optr[i] = iptr[i] * scales[c];
            }
          }
        }
      }
    } break;
    default:
      break;
  }
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "../sigmoid_kernel.h"
#include "../central-arm-func/sigmoid_arm_func.h"
#if __ARM_NEON
#ifdef __ARM_NEON
#include "../../math/math_func_neon.h"
#endif
#include <cmath>
......
......@@ -12,16 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_USE_OPENMP
/**
* android-ndk-r17 has a problem when linking with openmp.
* if paddle-mobile enables -fopenmp, but didn't use those omp_* functions,
* after linking another binary with libpaddle-mobile.so, the omp_get_thread_num
* will not work. see test/common/test_openmp.cc the detailed reason is still
* unclear, but this trick will work. a better solution is hacking the linker,
* try some flags to make it link omp_* functions, but I didn't find out how to
* make it work.
*/
#include <omp.h>
static int _ = omp_get_num_procs();
#ifdef SLICE_OP
#include "operators/kernel/slice_kernel.h"
namespace paddle_mobile {
namespace operators {}
} // namespace paddle_mobile
#endif
......@@ -53,6 +53,8 @@ void BatchnormCompute(const BatchNormParam &param) {
"C must equal to variance.numel()");
int HXW = H * W;
#ifdef ARMV7
if (HXW > 32) {
int NXC = N * C;
float *inv_std_ptr = new float[NXC * 4];
......@@ -227,6 +229,38 @@ void BatchnormCompute(const BatchNormParam &param) {
delete[] inv_std_ptr;
}
#else
float *inv_std_ptr = new float[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor new_scale;
auto new_scale_ptr = new_scale.mutable_data<float>(framework::make_ddim({C}));
Tensor new_bias;
auto new_bias_ptr = new_bias.mutable_data<float>(framework::make_ddim({C}));
/// ((x - est_mean) * (inv_var) * scale + bias equal to
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
{
for (int n = 0; n < N; n++) {
for (int h = 0; h < H; h++) {
int tmp_index = n * stride0 + i * stride1 + h * stride2;
for (int w = 0; w < W; w++) {
int index = tmp_index + w;
out_ptr[index] =
input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
}
}
}
}
}
delete[] inv_std_ptr;
#endif
}
} // namespace operators
......
......@@ -124,9 +124,15 @@ void ConvAddCompute(const FusionConvAddParam &param) {
} else if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3) {
math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
param.Filter(), param.Bias(), param.Output(), true);
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
// math::DepthwiseConv3x3(param.Input(), param.Strides(),
// param.Paddings(),
// param.Filter(), param.Bias(),
// param.Output(), false);
math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), param.Output(),
*param.Bias(), true);
} else {
ConvAddBasic(param);
}
......
......@@ -26,8 +26,6 @@ void ConvAddBNReluBasic(const FusionConvAddBNReluParam &param) {
Tensor bias = *param.Bias();
Tensor new_bias = *param.NewBias();
Tensor new_scale = *param.NewScale();
auto new_bias_ptr = new_bias.data<float>();
auto new_scale_ptr = new_scale.data<float>();
int axis = param.Axis();
Tensor *output = param.Output();
math::expand_bias(bias, axis, output->dims());
......@@ -106,20 +104,10 @@ void ConvAddBNReluBasic(const FusionConvAddBNReluParam &param) {
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(1));
}
}
/// todo : use neon in special case instead of 2for(300ms)
auto output_ptr = output->data<float>();
for (int c = 0; c < output_matrix_shape[0]; c++) {
int start = c * output_matrix_shape[1];
for (int j = 0; j < output_matrix_shape[1]; j++) {
output_ptr[start + j] =
output_ptr[start + j] * new_scale_ptr[c] + new_bias_ptr[c];
output_ptr[start + j] =
output_ptr[start + j] < 0 ? 0 : output_ptr[start + j];
math::matmulWithBn<float>(
filter_slice, false, col_matrix, false, static_cast<float>(1),
&out_slice, static_cast<float>(0), true, &new_scale, &new_bias);
}
}
}
......@@ -131,15 +119,19 @@ void ConvAddBNReluCompute(const FusionConvAddBNReluParam &param) {
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
math::DepthwiseConvAddBNRelu3x3s1p1(
param.Input(), param.Filter(), param.Output(), &Bias, 1,
param.NewScale(), param.NewBias(), 1, 1);
} else if (0 && param.Groups() == param.Input()->dims()[1] &&
math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), 1);
} else if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
param.Filter(), &Bias, param.Output(), false);
// math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
// param.Output(), param.NewScale(),
// param.NewBias(), 1);
math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), true);
} else {
ConvAddBNReluBasic(param);
}
......
......@@ -37,8 +37,12 @@ void DepthwiseConvCompute(const ConvParam &param) {
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
param.Filter(), &Bias, param.Output(), false);
// math::DepthwiseConv3x3(param.Input(), param.Strides(),
// param.Paddings(),
// param.Filter(), &Bias, param.Output(), false);
math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), param.Output(),
Bias, false);
} else {
ConvBasic(param);
}
......
......@@ -76,12 +76,15 @@ void PoolCompute(const PoolParam &param) {
}
} else if (ksize[0] == 2 && ksize[0] == ksize[1]) {
#ifndef IOS
if (pooling_type == "max") {
math::Pool2x2Max(strides, paddings, in_x, out);
} else if (pooling_type == "avg") {
math::Pool2x2Avg(strides, paddings, in_x, out);
}
#else
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
#endif
} else {
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
}
......
......@@ -14,8 +14,10 @@ limitations under the License. */
#ifdef SIGMOID_OP
#pragma once
#include <cmath>
#include "operators/op_param.h"
#if __ARM_NEON
#ifdef __ARM_NEON
#include <arm_neon.h>
#include "operators/math/math_func_neon.h"
#endif
......@@ -24,7 +26,7 @@ namespace paddle_mobile {
namespace operators {
using framework::DDim;
void sigmoid(const Tensor *X, Tensor *Y) {
#if __ARM_NEON
#ifdef __ARM_NEON
const float *input = X->data<float>();
float *output = Y->mutable_data<float>();
const DDim &dDim = X->dims();
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#pragma once
#include <vector>
#if __ARM_NEON
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#include "common/common.h"
......
......@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef LRN_OP
#ifdef _OPENMP
#include <omp.h>
#endif
#include "framework/operator.h"
#include "operators/op_param.h"
......@@ -47,6 +49,7 @@ struct LRNFunctor {
std::fill(sqr_buffer_ptr, sqr_buffer_ptr + sqr_buffer.numel(), 0.0);
for (int a = 0; a < N; a++) {
#pragma parallel for
for (int b = 0; b < C; b++) {
for (int index = start; index < end; index++) {
int channel = b + index;
......
文件模式从 100644 更改为 100755
......@@ -225,6 +225,7 @@ class AclParameters {
bool is_global_pool;
bool is_channel_concat;
bool is_bypass;
std::vector<framework::LoDTensor *> in_tensor;
};
......
文件模式从 100644 更改为 100755
文件模式从 100644 更改为 100755
......@@ -71,6 +71,7 @@ class AclBatchNormOp : public acl::ACLOperator {
bool Bypass_acl(const BatchNormParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
InitAclLayer(param);
// for performance, more groups impact GPU performance
if (this->force_bypass_acl_path_) {
bypass_acl = true;
......@@ -135,6 +136,10 @@ bool BatchNormKernel<GPU_MALI, float>::Init(BatchNormParam* param) {
acl_op = new AclBatchNormOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
if (acl_op->Bypass_acl(*param)) {
std::cout << "init acl failed" << std::endl;
return false;
}
return true;
}
......@@ -147,15 +152,8 @@ void BatchNormKernel<GPU_MALI, float>::Compute(
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
acl_op->RunAcl((void*)input_data, (void*)output_data);
acl_op->RunAcl(args.input_data, args.output_data);
}
template class BatchNormKernel<GPU_MALI, float>;
......
......@@ -50,8 +50,6 @@ class AclConcatOp : public acl::ACLOperator {
T type;
for (int i = 0; i < input_data->size(); i++) {
const T* idata = (*input_data)[i]->data<T>();
const T* pdata = (*input_data)[i]->data<T>();
int in_batch = (*input_data)[i]->dims()[0];
int in_channels = (*input_data)[i]->dims()[1];
int in_width = (*input_data)[i]->dims()[2];
......@@ -75,6 +73,7 @@ class AclConcatOp : public acl::ACLOperator {
bool Bypass_acl(const ConcatParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
InitAclLayer(param);
// for performance, more groups impact GPU performance
if (this->force_bypass_acl_path_ || !args.is_channel_concat) {
bypass_acl = true;
......@@ -103,13 +102,17 @@ class AclConcatOp : public acl::ACLOperator {
};
template <>
bool ConcatKernel<GPU_MALI, float>::Init(const ConcatParam& param) const {
bool ConcatKernel<GPU_MALI, float>::Init(ConcatParam* param) {
AclConcatOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConcatOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclConcatOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
if (acl_op->Bypass_acl(*param)) {
std::cout << "init acl failed" << std::endl;
return false;
}
return true;
}
......@@ -121,15 +124,8 @@ void ConcatKernel<GPU_MALI, float>::Compute(const ConcatParam& param) const {
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
std::vector<framework::LoDTensor*> temp_data = args.in_tensor;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
acl_op->RunAcl(temp_data, (void*)output_data);
acl_op->RunAcl(args.in_tensor, args.output_data);
}
template class ConcatKernel<GPU_MALI, float>;
......
......@@ -55,7 +55,8 @@ class AclConvAddOp : public acl::ACLOperator {
set_operator_init_done();
this->force_bypass_acl_path_ = false;
check_direct_conv();
// check_direct_conv();
group() = args.num_group;
//[kernel_x, kernel_y, IFM, OFM]
new_tensor(weights(), weights_shape, args.weight_data);
//[OFM]
......@@ -63,8 +64,6 @@ class AclConvAddOp : public acl::ACLOperator {
new_tensor(biases(), biases_shape, args.biases_data);
}
group() = args.num_group;
//[width, height, IFM]
new_tensor(input(), input_shape, args.input_data);
//[width, height, OFM]
......@@ -79,6 +78,7 @@ class AclConvAddOp : public acl::ACLOperator {
bool Bypass_acl(const FusionConvAddParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
InitAclLayer(param);
// for performance, more groups impact GPU performance
if (this->force_bypass_acl_path_ || args.num_group >= 5) {
bypass_acl = true;
......@@ -196,14 +196,17 @@ class AclConvAddOp : public acl::ACLOperator {
};
template <>
bool ConvAddKernel<GPU_MALI, float>::Init(
const FusionConvAddParam& param) const {
bool ConvAddKernel<GPU_MALI, float>::Init(FusionConvAddParam* param) {
AclConvAddOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConvAddOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclConvAddOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
if (acl_op->Bypass_acl(*param)) {
std::cout << "init acl failed" << std::endl;
return false;
}
return true;
}
......@@ -216,15 +219,9 @@ void ConvAddKernel<GPU_MALI, float>::Compute(
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
acl_op->RunAcl((void*)input_data, (void*)output_data);
acl_op->RunAcl(args.input_data, args.output_data);
}
template class ConvAddKernel<GPU_MALI, float>;
......
......@@ -79,6 +79,7 @@ class AclConvOp : public acl::ACLOperator {
bool Bypass_acl(const ConvParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
InitAclLayer(param);
// for performance, more groups impact GPU performance
if (this->force_bypass_acl_path_ || args.num_group >= 5) {
bypass_acl = true;
......@@ -202,6 +203,10 @@ bool ConvKernel<GPU_MALI, float>::Init(ConvParam* param) {
acl_op = new AclConvOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
if (acl_op->Bypass_acl(*param)) {
std::cout << "init acl failed" << std::endl;
return false;
}
return true;
}
......@@ -213,15 +218,8 @@ void ConvKernel<GPU_MALI, float>::Compute(const ConvParam& param) const {
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
acl_op->RunAcl((void*)input_data, (void*)output_data);
acl_op->RunAcl(args.input_data, args.output_data);
}
template class ConvKernel<GPU_MALI, float>;
......
文件模式从 100644 更改为 100755
文件模式从 100644 更改为 100755
......@@ -20,6 +20,7 @@ limitations under the License. */
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/kernel/central-arm-func/lrn_arm_func.h"
#include "operators/op_param.h"
namespace paddle_mobile {
......@@ -59,12 +60,15 @@ class AclLrnOp : public acl::ACLOperator {
acl_configure(lrn, this, norm_info);
}
void Set_bypass(bool bypass) { args.is_bypass = bypass; }
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const LrnParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
InitAclLayer(param);
// for performance, more groups impact GPU performance
if (this->force_bypass_acl_path_) {
bypass_acl = true;
......@@ -107,13 +111,18 @@ class AclLrnOp : public acl::ACLOperator {
};
template <>
bool LrnKernel<GPU_MALI, float>::Init(const LrnParam& param) const {
bool LrnKernel<GPU_MALI, float>::Init(LrnParam* param) {
AclLrnOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclLrnOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
if (acl_op->Bypass_acl(*param)) {
acl_op->Set_bypass(true);
std::cout << "init acl failed" << std::endl;
return true;
}
return true;
}
......@@ -125,14 +134,14 @@ void LrnKernel<GPU_MALI, float>::Compute(const LrnParam& param) const {
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
acl::AclParameters& args = acl_op->getargs();
if (args.is_bypass) {
std::cout << "bypass op" << std::endl;
LrnCompute<float>(param);
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
for (int n = 0; n < args.batch; ++n) {
acl_op->RunAcl((void*)input_data, (void*)output_data);
input_data += args.in_depth * args.in_cols * args.in_rows;
......
文件模式从 100644 更改为 100755
......@@ -82,6 +82,7 @@ class AclPoolOp : public acl::ACLOperator {
bool Bypass_acl(const PoolParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
InitAclLayer(param);
// for performance, more groups impact GPU performance
if (this->force_bypass_acl_path_) {
bypass_acl = true;
......@@ -179,13 +180,17 @@ class AclPoolOp : public acl::ACLOperator {
};
template <>
bool PoolKernel<GPU_MALI, float>::Init(const PoolParam& param) const {
bool PoolKernel<GPU_MALI, float>::Init(PoolParam* param) {
AclPoolOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclPoolOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclPoolOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
if (acl_op->Bypass_acl(*param)) {
std::cout << "init acl failed" << std::endl;
return false;
}
return true;
}
......@@ -197,14 +202,9 @@ void PoolKernel<GPU_MALI, float>::Compute(const PoolParam& param) const {
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
for (int n = 0; n < args.batch; ++n) {
acl_op->RunAcl((void*)input_data, (void*)output_data);
input_data += args.in_depth * args.in_cols * args.in_rows;
......
......@@ -41,10 +41,10 @@ class AclReluOp : public acl::ACLOperator {
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const ReluParam& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape input_shape(args.in_cols * args.in_rows *
args.in_depth * args.batch);
arm_compute::TensorShape output_shape(args.in_cols * args.in_rows *
args.in_depth * args.out_num);
arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
args.in_depth, args.batch);
arm_compute::TensorShape output_shape(args.in_cols, args.in_rows,
args.in_depth, args.out_num);
// arm_compute::TensorShape weights_shape(
// args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
// arm_compute::TensorShape biases_shape(args.out_depth);
......@@ -71,6 +71,7 @@ class AclReluOp : public acl::ACLOperator {
bool Bypass_acl(const ReluParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
InitAclLayer(param);
// for performance, more groups impact GPU performance
if (this->force_bypass_acl_path_) {
bypass_acl = true;
......@@ -99,13 +100,17 @@ class AclReluOp : public acl::ACLOperator {
};
template <>
bool ReluKernel<GPU_MALI, float>::Init(const ReluParam& param) const {
bool ReluKernel<GPU_MALI, float>::Init(ReluParam* param) {
AclReluOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclReluOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
if (acl_op->Bypass_acl(*param)) {
std::cout << "init acl failed" << std::endl;
return false;
}
return true;
}
......@@ -117,15 +122,8 @@ void ReluKernel<GPU_MALI, float>::Compute(const ReluParam& param) const {
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
acl_op->RunAcl((void*)input_data, (void*)output_data);
acl_op->RunAcl(args.input_data, args.output_data);
}
template class ReluKernel<GPU_MALI, float>;
......
文件模式从 100644 更改为 100755
......@@ -61,6 +61,7 @@ class AclSoftmaxOp : public acl::ACLOperator {
bool Bypass_acl(const SoftmaxParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
InitAclLayer(param);
// for performance, more groups impact GPU performance
if (this->force_bypass_acl_path_) {
bypass_acl = true;
......@@ -96,13 +97,17 @@ class AclSoftmaxOp : public acl::ACLOperator {
};
template <>
bool SoftmaxKernel<GPU_MALI, float>::Init(const SoftmaxParam& param) const {
bool SoftmaxKernel<GPU_MALI, float>::Init(SoftmaxParam* param) {
AclSoftmaxOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclSoftmaxOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
if (acl_op->Bypass_acl(*param)) {
std::cout << "init acl failed" << std::endl;
return false;
}
return true;
}
......@@ -114,14 +119,10 @@ void SoftmaxKernel<GPU_MALI, float>::Compute(const SoftmaxParam& param) const {
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
for (int n = 0; n < args.out_num; ++n) {
acl_op->RunAcl((void*)input_data, (void*)output_data);
input_data += args.in_depth;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

// Fix: the include guard must precede the includes, and `#pragma once`
// takes no trailing semicolon ("#pragma once;" is non-standard and warns
// on some compilers).
#pragma once

#include "framework/operator.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

// Declaration of the PRelu (parametric ReLU) kernel. The device-specific
// Compute implementation is provided elsewhere, per backend.
template <typename DeviceType, typename T>
class PReluKernel : public framework::OpKernelBase<DeviceType, PReluParam> {
 public:
  // Runs the PRelu op on the tensors referenced by `param`; output is
  // written through the param object (no return value).
  void Compute(const PReluParam& param) const;
};
}  // namespace operators
}  // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef RESIZE_OP

#pragma once

#include <vector>

#include "framework/operator.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

// Computes the output NCHW shape for the resize op.
//
// When a shape tensor is supplied, the target height/width come either from
// the fixed param.Height()/param.Width() (non-pyramid mode) or from scaling
// the input spatial size by param.OutHeightScale()/param.OutWidthScale()
// (pyramid-test mode). Batch and channel dimensions are always preserved.
// When no shape tensor is supplied, the output tensor's current dims are
// returned unchanged.
inline framework::DDim CalOutputShape(const ResizeParam &param) {
  const auto *input_x = param.InputX();
  const auto &input_x_dims = input_x->dims();
  auto *out = param.Out();
  framework::DDim out_dims = out->dims();
  const auto *input_shape = param.InputShape();
  if (input_shape) {
    // NOTE(review): the raw shape-tensor contents are read here but not
    // otherwise consumed below — confirm whether they should override the
    // computed dims.
    auto *shape_data = input_shape->data<int>();
    framework::Tensor cpu_shape_tensor;
    auto shape =
        std::vector<int>(shape_data, shape_data + input_shape->numel());
    const int in_batch_size = input_x->dims()[0];
    const int in_chan_size = input_x->dims()[1];
    const int in_height = input_x->dims()[2];
    const int in_width = input_x->dims()[3];
    int out_height = 0;
    int out_width = 0;
    bool is_pyramid_test = param.IsPyramidTest();
    if (is_pyramid_test == false) {
      // Fixed target size supplied directly by the op attributes.
      out_height = param.Height();
      out_width = param.Width();
      PADDLE_MOBILE_ENFORCE(out_height > 0, "output height is required");
      PADDLE_MOBILE_ENFORCE(out_width > 0, "output width is required");
    } else {
      // Pyramid mode: scale the input spatial size.
      float out_height_scale = param.OutHeightScale();
      float out_width_scale = param.OutWidthScale();
      PADDLE_MOBILE_ENFORCE(out_height_scale > 0,
                            "output height scale is required");
      PADDLE_MOBILE_ENFORCE(out_width_scale > 0,
                            "output width scale is required");
      out_height = int(out_height_scale * in_height);
      out_width = int(out_width_scale * in_width);
    }
    // Fix: use the computed out_height/out_width. The previous code built
    // the dims from in_height/in_width, discarding the values validated
    // above, so the resize op always reported the input spatial size.
    out_dims = framework::make_ddim(
        {in_batch_size, in_chan_size, out_height, out_width});
  }
  return out_dims;
}

// Declaration of the resize kernel; Compute is implemented per backend.
template <typename DeviceType, typename T>
class ResizeKernel : public framework::OpKernelBase<DeviceType, ResizeParam> {
 public:
  void Compute(const ResizeParam &param) const;
};
}  // namespace operators
}  // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

// Fix: place the include guard before the includes and drop the trailing
// semicolon — "#pragma once;" is non-standard and warns on some compilers.
#pragma once

#include "framework/operator.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

// Declaration of the scale kernel; the device-specific Compute
// implementation is provided elsewhere, per backend.
template <typename DeviceType, typename T>
class ScaleKernel : public framework::OpKernelBase<DeviceType, ScaleParam> {
 public:
  // Applies the scale op to the tensors referenced by `param`; output is
  // written through the param object (no return value).
  void Compute(const ScaleParam& param) const;
};
}  // namespace operators
}  // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

// Fix: place the include guard before the includes and drop the trailing
// semicolon — "#pragma once;" is non-standard and warns on some compilers.
#pragma once

#include "framework/operator.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

// Slice kernel declaration. Compute is currently an intentional no-op
// (empty inline body), kept so the op can be registered uniformly.
template <typename DeviceType, typename T>
class SliceKernel : public framework::OpKernelBase<DeviceType, SliceParam> {
 public:
  void Compute(const SliceParam& param) const {}
};
}  // namespace operators
}  // namespace paddle_mobile
......@@ -30,11 +30,9 @@ template class LrnOp<CPU, float>;
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(lrn);
REGISTER_OPERATOR_CPU(lrn, ops::LrnOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(lrn);
REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
......
......@@ -46,4 +46,13 @@ class LrnOp : public framework::OperatorWithKernel<
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(lrn);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(lrn);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once
#if __ARM_NEON
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
......@@ -49,7 +49,7 @@ inline void expand_bias(Tensor &bias, int axis, const DDim &dDim) {
auto new_ptr = bias.mutable_data<float>();
int axis_size = dDim[axis];
#if __ARM_NEON
#ifdef __ARM_NEON
for (int i = 0; i < outer_size; ++i) {
int inner_num = inner_size >> 4;
int remain = inner_size - (inner_num << 4);
......
......@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/math/depthwise_conv_3x3.h"
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#include <vector>
namespace paddle_mobile {
......@@ -21,7 +23,7 @@ namespace math {
void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
vector<int> paddings, const Tensor *filter, Tensor *bias,
Tensor *output, bool if_bias) {
#if __ARM_NEON
#ifdef __ARM_NEON
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
......@@ -242,6 +244,7 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
Tensor *output, Tensor *bias, bool if_bias) {
#ifdef __ARM_NEON
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
float *output_data = output->data<float>();
......@@ -511,17 +514,16 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
filter_data_tmp += 9;
}
}
#endif
}
void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
Tensor *output, Tensor *bias, bool if_bias,
const Tensor *new_scale,
const Tensor *new_bias, bool if_bn,
bool if_relu) {
Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu) {
#ifdef __ARM_NEON
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
float *output_data = output->data<float>();
const float *bias_data = bias->data<float>();
const float *newscale_data = new_scale->data<float>();
const float *newbias_data = new_bias->data<float>();
......@@ -532,7 +534,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
const int batch_size = static_cast<int>(input->dims()[0]);
const int c = static_cast<int>(input->dims()[1]);
const int hxw = h * w;
float32x4_t vbias = vdupq_n_f32(0.0);
float32x4_t vnewbias = vdupq_n_f32(0.0);
float32x4_t vnewscale = vdupq_n_f32(1.0);
float32x4_t vzero = vdupq_n_f32(0);
......@@ -541,13 +542,9 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
const float *filter_data_tmp = filter_data;
for (int j = 0; j < c; ++j) {
if (if_bias) {
vbias = vdupq_n_f32(bias_data[j]);
}
if (if_bn) {
vnewbias = vdupq_n_f32(newbias_data[j]);
vnewscale = vdupq_n_f32(newscale_data[j]);
}
vnewbias = vdupq_n_f32(newbias_data[j]);
vnewscale = vdupq_n_f32(newscale_data[j]);
int l_mid = l - 2; // l=1->l_mid=-1,l=2->l_mid=0
float w00 = filter_data_tmp[0];
float w01 = filter_data_tmp[1];
......@@ -573,21 +570,14 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
w01 * input_data[(l - 2) * (l + 1) + 1] +
w10 * input_data[l * l - 2] +
w11 * input_data[l * l - 1];
if (if_bias) {
output_data[0] += bias_data[j];
output_data[l - 1] += bias_data[j];
output_data[(l - 1) * l] += bias_data[j];
output_data[l * l - 1] += bias_data[j];
}
if (if_bn) {
output_data[0] = output_data[0] * newscale_data[j] + newbias_data[j];
output_data[l - 1] =
output_data[l - 1] * newscale_data[j] + newbias_data[j];
output_data[(l - 1) * l] =
output_data[(l - 1) * l] * newscale_data[j] + newbias_data[j];
output_data[l * l - 1] =
output_data[l * l - 1] * newscale_data[j] + newbias_data[j];
}
output_data[0] = output_data[0] * newscale_data[j] + newbias_data[j];
output_data[l - 1] =
output_data[l - 1] * newscale_data[j] + newbias_data[j];
output_data[(l - 1) * l] =
output_data[(l - 1) * l] * newscale_data[j] + newbias_data[j];
output_data[l * l - 1] =
output_data[l * l - 1] * newscale_data[j] + newbias_data[j];
if (if_relu) {
output_data[0] = output_data[0] < 0 ? 0 : output_data[0];
output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l - 1];
......@@ -607,16 +597,11 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
w11 * input_data[i * l + l - 1] +
w20 * input_data[i * l + l - 1 + l - 1] +
w21 * input_data[i * l + l - 1 + l];
if (if_bias) {
output_data[i * l] += bias_data[j];
output_data[i * l + l - 1] += bias_data[j];
}
if (if_bn) {
output_data[i * l] =
output_data[i * l] * newscale_data[j] + newbias_data[j];
output_data[i * l + l - 1] =
output_data[i * l + l - 1] * newscale_data[j] + newbias_data[j];
}
output_data[i * l] =
output_data[i * l] * newscale_data[j] + newbias_data[j];
output_data[i * l + l - 1] =
output_data[i * l + l - 1] * newscale_data[j] + newbias_data[j];
if (if_relu) {
output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i * l];
output_data[i * l + l - 1] =
......@@ -652,7 +637,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
out0 = vmlaq_n_f32(out0, in2, w20);
out0 = vmlaq_n_f32(out0, tmp2, w21);
out0 = vmlaq_n_f32(out0, tmp3, w22);
out0 = vaddq_f32(out0, vbias);
out0 = vmlaq_f32(vnewbias, vnewscale, out0);
if (if_relu) {
out0 = vmaxq_f32(out0, vzero);
......@@ -673,7 +657,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
out0 = vmlaq_n_f32(out0, in6, w10);
out0 = vmlaq_n_f32(out0, tmp2, w11);
out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vaddq_f32(out0, vbias);
out0 = vmlaq_f32(vnewbias, vnewscale, out0);
if (if_relu) {
out0 = vmaxq_f32(out0, vzero);
......@@ -705,7 +688,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
out0 = vmlaq_n_f32(out0, in2, w20);
out0 = vmlaq_n_f32(out0, tmp2, w21);
out0 = vmlaq_n_f32(out0, tmp3, w22);
out0 = vaddq_f32(out0, vbias);
out0 = vmlaq_f32(vnewbias, vnewscale, out0);
if (if_relu) {
out0 = vmaxq_f32(out0, vzero);
......@@ -737,7 +719,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
out0 = vmlaq_n_f32(out0, in6, w10);
out0 = vmlaq_n_f32(out0, tmp2, w11);
out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vaddq_f32(out0, vbias);
out0 = vmlaq_f32(vnewbias, vnewscale, out0);
if (if_relu) {
out0 = vmaxq_f32(out0, vzero);
......@@ -783,7 +764,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
out0 = vmlaq_n_f32(out0, in4_tmp, w20);
out0 = vmlaq_n_f32(out0, tmp4, w21);
out0 = vmlaq_n_f32(out0, tmp5, w22);
out0 = vaddq_f32(out0, vbias);
out0 = vmlaq_f32(vnewbias, vnewscale, out0);
if (if_relu) {
out0 = vmaxq_f32(out0, vzero);
......@@ -817,7 +797,6 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
out0 = vmlaq_n_f32(out0, in4_tmp, w20);
out0 = vmlaq_n_f32(out0, tmp4, w21);
out0 = vmlaq_n_f32(out0, tmp5, w22);
out0 = vaddq_f32(out0, vbias);
out0 = vmlaq_f32(vnewbias, vnewscale, out0);
if (if_relu) {
out0 = vmaxq_f32(out0, vzero);
......@@ -839,7 +818,647 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
filter_data_tmp += 9;
}
}
#endif
}
// Fused depthwise 3x3 convolution (stride 2, padding 1) with a per-channel
// affine transform (new_scale * x + new_bias, i.e. folded batch-norm) and an
// optional ReLU clamp. Interior windows take a NEON path; windows clipped by
// the border are materialized into a zero-padded 3x3 `fake_input` and
// convolved with scalar code. Entire body compiles away without __ARM_NEON.
void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
                                   Tensor *output, const Tensor *new_scale,
                                   const Tensor *new_bias, bool if_relu) {
#ifdef __ARM_NEON
  const int batch_size = input->dims()[0];
  const int input_height = input->dims()[2];
  const int input_width = input->dims()[3];
  const int output_channels = output->dims()[1];
  const int output_height = output->dims()[2];
  const int output_width = output->dims()[3];
  // Hard-wired 3x3 / stride-2 / pad-1 geometry of this specialization.
  const int _kernel_size = 3;
  const int stride_height = 2;
  const int stride_width = 2;
  const int padding_height = 1;
  const int padding_width = 1;
  const float zero = 0;  // NOTE(review): unused local.
  const int input_channel_stride = input_height * input_width;
  const int output_channel_stride = output_height * output_width;
  const int filter_channel_stride = 9;  // 3x3 weights per channel
  const float *newscale_data = new_scale->data<float>();
  const float *newbias_data = new_bias->data<float>();
  const float *input_data = input->data<float>();
  const float *filter_data = filter->data<float>();
  float *output_data = output->mutable_data<float>();
  const int input_batch_stride = output_channels * input_channel_stride;
  const int output_batch_stride = output_channels * output_channel_stride;
  // NOTE(review): unused, and computed from output_channel_stride rather
  // than filter_channel_stride — looks like a copy/paste slip; confirm.
  const int filter_batch_stride = output_channels * output_channel_stride;
  const float *pos1, *pos2, *pos3, *filter1, *filter2, *filter3, *output_ptr;
  int hstart, wstart, hend, wend;
  float result;
  for (int i = 0; i < batch_size; ++i) {
    for (int c = 0; c < output_channels; ++c) {
      // Three filter rows for the current channel.
      filter1 = filter_data;
      filter2 = filter1 + 3;
      filter3 = filter2 + 3;
      for (int ph = 0; ph < output_height; ph++) {
        for (int pw = 0; pw < output_width; pw++) {
          // Clip the 3x3 window against the padded then the real bounds.
          hstart = ph * stride_height - padding_height;
          wstart = pw * stride_width - padding_width;
          hend = min(hstart + _kernel_size, input_height + padding_height);
          wend = min(wstart + _kernel_size, input_width + padding_width);
          hstart = max(hstart, 0);
          wstart = max(wstart, 0);
          hend = min(hend, input_height);
          wend = min(wend, input_width);
          // Pointers to the three input rows under the window.
          pos1 = input_data + hstart * input_width + wstart;
          pos2 = input_data + (hstart + 1) * input_width + wstart;
          pos3 = input_data + (hstart + 2) * input_width + wstart;
          output_ptr = output_data + ph * output_width + pw;  // NOTE(review): assigned but never read; stores below index output_data directly.
          if (hend - hstart != 3 || wend - wstart != 3) {
            // Border case: rebuild the window as a zero-padded 3x3 patch,
            // then do a scalar dot product against filter row 1 (which is
            // the start of all 9 weights for this channel).
            result = 0;
            float fake_input[9] = {0};
            if (hstart == 0 && wstart == 0) {
              // top-left corner
              for (int j = 0; j < 3; ++j) {
                for (int k = 0; k < 3; ++k) {
                  if (j >= 3 - hend && k >= 3 - wend) {
                    fake_input[3 * j + k] =
                        input_data[(j - (3 - hend)) * input_width + k -
                                   (3 - wend)];
                  }
                }
              }
            } else if (hstart == 0 && wend == input_width) {
              // top-right corner
              for (int j = 0; j < 3; ++j) {
                for (int k = 0; k < 3; ++k) {
                  if (j >= 3 - hend && k <= input_width - wstart - 1) {
                    fake_input[3 * j + k] =
                        input_data[(j - (3 - hend)) * input_width + k + wstart];
                  }
                }
              }
            } else if (hend == input_height && wstart == 0) {
              // bottom-left corner
              for (int j = 0; j < 3; ++j) {
                for (int k = 0; k < 3; ++k) {
                  if (j <= input_height - 1 - hstart && k >= 3 - wend) {
                    fake_input[3 * j + k] =
                        input_data[(j + hstart) * input_width + k - (3 - wend)];
                  }
                }
              }
            } else if (hend == input_height && wend == input_width) {
              // bottom-right corner
              for (int j = 0; j < 3; ++j) {
                for (int k = 0; k < 3; ++k) {
                  if (j <= input_height - hstart - 1 &&
                      k <= input_width - wstart - 1) {
                    fake_input[3 * j + k] =
                        input_data[(j + hstart) * input_width + k + wstart];
                  }
                }
              }
            } else if (hstart == 0) {
              // top edge
              for (int j = 0; j < 3; ++j) {
                for (int k = 0; k < 3; ++k) {
                  if (j >= 3 - hend) {
                    fake_input[3 * j + k] =
                        input_data[(j - (3 - hend)) * input_width + k + wstart];
                  }
                }
              }
            } else if (hend == input_height) {
              // bottom edge
              for (int j = 0; j < 3; ++j) {
                for (int k = 0; k < 3; ++k) {
                  if (j <= input_height - hstart - 1) {
                    fake_input[3 * j + k] =
                        input_data[(j + hstart) * input_width + k + wstart];
                  }
                }
              }
            } else if (wstart == 0) {
              // left edge
              for (int j = 0; j < 3; ++j) {
                for (int k = 0; k < 3; ++k) {
                  if (k >= 3 - wend) {
                    fake_input[3 * j + k] =
                        input_data[(j + hstart) * input_width +
                                   (k - (3 - wend))];
                  }
                }
              }
            } else if (wend == input_width) {
              // right edge
              for (int j = 0; j < 3; ++j) {
                for (int k = 0; k < 3; ++k) {
                  if (k <= input_width - wstart - 1) {
                    fake_input[3 * j + k] =
                        input_data[(j + hstart) * input_width + k + wstart];
                  }
                }
              }
            }
            for (int l = 0; l < 9; ++l) {
              result += fake_input[l] * filter1[l];
            }
            // Apply folded batch-norm, then optional ReLU.
            output_data[ph * output_width + pw] =
                newscale_data[c] * result + newbias_data[c];
            if (if_relu) {
              output_data[ph * output_width + pw] =
                  output_data[ph * output_width + pw] < 0
                      ? 0
                      : output_data[ph * output_width + pw];
            }
          } else {
            // Interior case: multiply-accumulate the three 4-lane rows
            // (only the first 3 lanes of each are part of the window —
            // lane 3 of the accumulator is zeroed before the reduction).
            const float32x4_t data1 = vld1q_f32(pos1);
            const float32x4_t data2 = vld1q_f32(pos2);
            const float32x4_t data3 = vld1q_f32(pos3);
            const float32x4_t v_filter1 = vld1q_f32(filter1);
            const float32x4_t v_filter2 = vld1q_f32(filter2);
            const float32x4_t v_filter3 = vld1q_f32(filter3);
            float32x4_t mula = vmulq_f32(data1, v_filter1);
            mula = vmlaq_f32(mula, data2, v_filter2);
            mula = vmlaq_f32(mula, data3, v_filter3);
            // Horizontal sum of lanes 0..2 via pairwise adds.
            float32x2_t res = vpadd_f32(
                vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula));
            res = vpadd_f32(res, res);
            output_data[ph * output_width + pw] =
                vget_lane_f32(res, 0) * newscale_data[c] + newbias_data[c];
            if (if_relu) {
              output_data[ph * output_width + pw] =
                  output_data[ph * output_width + pw] < 0
                      ? 0
                      : output_data[ph * output_width + pw];
            }
          }
        }
      }
      // Advance to the next channel.
      input_data += input_channel_stride;
      output_data += output_channel_stride;
      filter_data += filter_channel_stride;
    }
    // Advance to the next batch. NOTE(review): input_batch_stride uses
    // output_channels; valid only for depthwise (in-channels == out-channels).
    input_data += input_batch_stride;
    output_data += output_batch_stride;
  }
#endif
}
// Depthwise 3x3 convolution, stride 2, padding 1, with optional per-channel
// bias. Assumes square spatial dims (out_l/in_l are taken from height only).
// Strategy: NEON path processes interior rows in groups of 3 outputs using
// de-interleaving vld2q loads (even/odd column split for stride 2), reusing
// the previous iteration's bottom row via input_buff_bottom; the image edges
// and the last row are then patched up with scalar code. No-op without
// __ARM_NEON.
// NOTE(review): `bias` is taken by value, copying a Tensor per call — a
// const Tensor& or pointer would match the sibling functions; confirm
// whether the copy is intentional.
void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
                            Tensor *output, Tensor bias, bool if_bias) {
#ifdef __ARM_NEON
  const float *input_data = input->data<float>();
  const float *filter_data = filter->data<float>();
  // NOTE(review): siblings use output->mutable_data<float>() here —
  // presumably the output buffer is already allocated by the caller; verify.
  float *output_data = output->data<float>();
  const float *bias_data = bias.data<float>();
  const int in_h = static_cast<int>(input->dims()[2]);
  const int in_w = static_cast<int>(input->dims()[3]);
  const int out_h = static_cast<int>(output->dims()[2]);
  const int out_w = static_cast<int>(output->dims()[3]);
  // Square-image shorthands used by the scalar edge fix-ups below.
  const int out_l = out_h;
  const int in_l = in_h;
  const int inhxw = in_h * in_w;
  const int outhxw = out_h * out_w;
  // 1 when the stride-2 grid lands exactly on the last input column/row
  // (i.e. the rightmost/bottom taps fall in the zero padding).
  const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 1 : 0;
  const int batch_size = static_cast<int>(input->dims()[0]);
  const int c = static_cast<int>(input->dims()[1]);
  const float *input_row_ptr;
  float *output_row_ptr;
  // Number of full 3-output NEON groups per interior row.
  const int w_times = (out_w - 2) / 3;
  float32x4_t vbias = vdupq_n_f32(0.0);
  // NOTE(review): input_buff_bottom is a variable-length array (GCC
  // extension, not standard C++).
  float32x4x2_t input_buff_mid{}, input_buff_bottom[w_times + 1];
  float32x4_t elewise_res0, elewise_res1, elewise_res2, res3;
  int out2in_mid;
  float32x4_t zero = vdupq_n_f32(0.0);
  for (int b = batch_size; b > 0; --b) {
    const float *filter_data_tmp = filter_data;
    for (int j = 0; j < c; ++j) {
      auto output_data_tmp = output_data + j * out_h * out_w;
      auto input_data_tmp = input_data + j * in_h * in_w;
      auto input_const = input_data_tmp;
      if (if_bias) {
        vbias = vdupq_n_f32(bias_data[j]);
      }
      // Unpack the 3x3 kernel for this channel into scalars.
      float w00 = filter_data_tmp[0];
      float w01 = filter_data_tmp[1];
      float w02 = filter_data_tmp[2];
      float w10 = filter_data_tmp[3];
      float w11 = filter_data_tmp[4];
      float w12 = filter_data_tmp[5];
      float w20 = filter_data_tmp[6];
      float w21 = filter_data_tmp[7];
      float w22 = filter_data_tmp[8];
      int h_mid = 0;
      // Interior output rows (all but the last): vectorized path.
      for (; h_mid < out_h - 1; h_mid++) {
        input_row_ptr = input_data_tmp + 1 + h_mid * 2 * in_w;
        output_row_ptr = output_data_tmp + 1 + h_mid * out_w;
        for (int w4 = 0; w4 < w_times + 1; w4++) {
          if (h_mid == 0) {
            // First row: the kernel's top row falls in the zero padding.
            elewise_res1 = zero;
            elewise_res0 = zero;
            elewise_res2 = zero;
          } else {
            // Reuse last iteration's bottom row as this row's top row.
            elewise_res1 = vmulq_n_f32(input_buff_bottom[w4].val[1], w01);
            elewise_res0 = vmulq_n_f32(input_buff_bottom[w4].val[0], w00);
            elewise_res2 = vmulq_n_f32(input_buff_bottom[w4].val[0], w02);
          }
          // vld2q de-interleaves: val[0]=even columns, val[1]=odd columns.
          input_buff_mid = vld2q_f32(input_row_ptr);
          input_buff_bottom[w4] = vld2q_f32(input_row_ptr + in_w);
          elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_mid.val[1], w11);
          elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10);
          elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12);
          elewise_res1 =
              vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21);
          elewise_res0 =
              vmlaq_n_f32(elewise_res0, input_buff_bottom[w4].val[0], w20);
          elewise_res2 =
              vmlaq_n_f32(elewise_res2, input_buff_bottom[w4].val[0], w22);
          // Shift the right-column partials into alignment and combine.
          res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1),
                           vaddq_f32(elewise_res0, elewise_res1));
          res3 = vaddq_f32(res3, vbias);
          vst1q_f32(output_row_ptr, res3);
          input_row_ptr += 6;
          output_row_ptr += 3;
        }
      }
      // NOTE(review): stray clock() call with a discarded result — looks
      // like leftover profiling code; confirm it can be removed.
      clock();
      // Last output row: bottom kernel row may lie in the padding (if_pad).
      input_row_ptr = input_data_tmp + 1 + h_mid * 2 * in_w;
      output_row_ptr = output_data_tmp + 1 + h_mid * out_w;
      for (int w4 = 0; w4 < w_times + 1; w4++) {
        elewise_res1 = vmulq_n_f32(input_buff_bottom[w4].val[1], w01);
        elewise_res0 = vmulq_n_f32(input_buff_bottom[w4].val[0], w00);
        elewise_res2 = vmulq_n_f32(input_buff_bottom[w4].val[0], w02);
        input_buff_mid = vld2q_f32(input_row_ptr);
        input_buff_bottom[w4] = vld2q_f32(input_row_ptr + in_w);
        elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_mid.val[1], w11);
        elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10);
        elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12);
        if (!if_pad) {
          // Bottom kernel row still inside the image: accumulate it.
          elewise_res1 =
              vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21);
          elewise_res0 =
              vmlaq_n_f32(elewise_res0, input_buff_bottom[w4].val[0], w20);
          elewise_res2 =
              vmlaq_n_f32(elewise_res2, input_buff_bottom[w4].val[0], w22);
        }
        res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1),
                         vaddq_f32(elewise_res0, elewise_res1));
        res3 = vaddq_f32(res3, vbias);
        if ((w4 != w_times)) {
          vst1q_f32(output_row_ptr, res3);
        } else {
          // Final group: store only the 1 or 2 remaining valid lanes.
          if (out_l - 2 - w_times * 3 == 1) {
            vst1q_lane_f32(output_row_ptr, res3, 0);
          } else if (out_l - 2 - w_times * 3 == 2) {
            vst1q_lane_f32(output_row_ptr, res3, 0);
            vst1q_lane_f32(output_row_ptr + 1, res3, 1);
          }
        }
        input_row_ptr += 6;
        output_row_ptr += 3;
      }
      // Scalar fix-ups for the four corners: each drops the kernel taps
      // that fall in the padding; (1 - if_pad) masks the taps that may
      // also be padded on the far edge.
      output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 +
                           input_const[in_l] * w21 +
                           input_const[in_l + 1] * w22;
      out2in_mid = (out_l - 1) * 2;
      output_data_tmp[out_l - 1] =
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
          w20 * input_const[out2in_mid + in_w - 1] +
          w21 * input_const[out2in_mid + in_w] +
          (1 - if_pad) * (w12 * input_const[out2in_mid + 1] +
                          w22 * input_const[out2in_mid + in_w + 1]);
      out2in_mid = (out_l - 1) * 2 * in_w;
      output_data_tmp[out_l * (out_l - 1)] =
          w01 * input_const[out2in_mid - in_w] +
          w02 * input_const[out2in_mid - in_w + 1] +
          w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] +
          (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] +
                          w22 * input_const[out2in_mid + in_w + 1]);
      out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2;
      output_data_tmp[out_l * out_l - 1] =
          w00 * input_const[out2in_mid - in_w - 1] +
          w01 * input_const[out2in_mid - in_w] +
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
          (1 - if_pad) * (w20 * input_const[out2in_mid + in_w - 1] +
                          w21 * input_const[out2in_mid + in_w] +
                          w02 * input_const[out2in_mid - in_w + 1] +
                          w12 * input_const[out2in_mid + 1] +
                          w22 * input_const[out2in_mid + in_w + 1]);
      if (if_bias) {
        output_data_tmp[0] += bias_data[j];
        output_data_tmp[out_l - 1] += bias_data[j];
        output_data_tmp[out_l * (out_l - 1)] += bias_data[j];
        output_data_tmp[out_l * out_l - 1] += bias_data[j];
      }
      // Scalar fix-ups for the left and right image columns of each
      // interior output row.
      for (int i = 1; i < out_h - 1; i++) {
        out2in_mid = i * 2 * in_w;
        output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w] +
                                     w02 * input_const[out2in_mid - in_w + 1] +
                                     w11 * input_const[out2in_mid] +
                                     w12 * input_const[out2in_mid + 1] +
                                     w21 * input_const[out2in_mid + in_w] +
                                     w22 * input_const[out2in_mid + in_w + 1];
        out2in_mid = i * 2 * in_w + (out_l - 1) * 2;
        output_data_tmp[i * out_l + out_l - 1] =
            w00 * input_const[out2in_mid - in_w - 1] +
            w01 * input_const[out2in_mid - in_w] +
            w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
            w20 * input_const[out2in_mid + in_w - 1] +
            w21 * input_const[out2in_mid + in_w] +
            (1 - if_pad) * (w02 * input_const[out2in_mid - in_w + 1] +
                            w12 * input_const[out2in_mid + 1] +
                            w22 * input_const[out2in_mid + in_w + 1]);
        if (if_bias) {
          output_data_tmp[i * out_l] += bias_data[j];
          output_data_tmp[i * out_l + out_l - 1] += bias_data[j];
        }
      }
      filter_data_tmp += 9;
    }
    // Advance to the next batch.
    input_data += inhxw * c;
    output_data += outhxw * c;
  }
#endif
}
// Fused depthwise 3x3 convolution (stride 2, pad 1, "v2" layout) + batch-norm
// scale/shift + optional ReLU:
//   out[c] = conv3x3(in[c]) * new_scale[c] + new_bias[c], clamped at 0 when
//   if_relu is set.
// Interior outputs are produced with NEON (vld2q_f32 de-interleaves the
// stride-2 input rows); the four corners and the first/last column of each
// row are finished with scalar code below.
// NOTE(review): out_l/in_l are taken from the *height* dims and reused as the
// width in the scalar border code, so square feature maps appear to be
// assumed -- confirm with callers.
// Fix vs. original: removed a stray `clock();` call between the two NEON row
// loops -- its return value was discarded, leftover profiling/debug code.
void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
                                     Tensor *output, const Tensor *new_scale,
                                     const Tensor *new_bias, bool if_relu) {
#ifdef __ARM_NEON
  const float *input_data = input->data<float>();
  const float *filter_data = filter->data<float>();
  float *output_data = output->data<float>();
  const float *newscale_data = new_scale->data<float>();
  const float *newbias_data = new_bias->data<float>();
  // Per-channel BN parameters; re-broadcast for each channel in the j loop.
  float32x4_t vnewbias = vdupq_n_f32(0.0);
  float32x4_t vnewscale = vdupq_n_f32(1.0);
  const int in_h = static_cast<int>(input->dims()[2]);
  const int in_w = static_cast<int>(input->dims()[3]);
  const int out_h = static_cast<int>(output->dims()[2]);
  const int out_w = static_cast<int>(output->dims()[3]);
  const int out_l = out_h;
  const int in_l = in_h;
  const int inhxw = in_h * in_w;
  const int outhxw = out_h * out_w;
  // if_pad == 1 when the stride-2 window of the last output row/col starts on
  // the last input row/col, i.e. its bottom/right taps fall into the padding.
  const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 1 : 0;
  const int batch_size = static_cast<int>(input->dims()[0]);
  const int c = static_cast<int>(input->dims()[1]);
  const float *input_row_ptr;
  float *output_row_ptr;
  // Number of extra 4-wide NEON iterations per row; each iteration advances
  // the output by 3 (overlapping stores, see vst1q_f32 below).
  const int w_times = (out_w - 2) / 3;
  // input_buff_bottom caches each iteration's bottom input row so the next
  // output row can reuse it as its top row.
  float32x4x2_t input_buff_mid{}, input_buff_bottom[w_times + 1];
  float32x4_t elewise_res0, elewise_res1, elewise_res2, res3;
  int out2in_mid;
  float32x4_t zero = vdupq_n_f32(0.0);
  for (int b = batch_size; b > 0; --b) {
    const float *filter_data_tmp = filter_data;
    for (int j = 0; j < c; ++j) {
      auto output_data_tmp = output_data + j * out_h * out_w;
      auto input_data_tmp = input_data + j * in_h * in_w;
      auto input_const = input_data_tmp;
      vnewbias = vdupq_n_f32(newbias_data[j]);
      vnewscale = vdupq_n_f32(newscale_data[j]);
      // The 3x3 filter taps for this channel, row-major: w<row><col>.
      float w00 = filter_data_tmp[0];
      float w01 = filter_data_tmp[1];
      float w02 = filter_data_tmp[2];
      float w10 = filter_data_tmp[3];
      float w11 = filter_data_tmp[4];
      float w12 = filter_data_tmp[5];
      float w20 = filter_data_tmp[6];
      float w21 = filter_data_tmp[7];
      float w22 = filter_data_tmp[8];
      // NEON pass over every output row except the last; columns 1..out_w-2.
      int h_mid = 0;
      for (; h_mid < out_h - 1; h_mid++) {
        input_row_ptr = input_data_tmp + 1 + h_mid * 2 * in_w;
        output_row_ptr = output_data_tmp + 1 + h_mid * out_w;
        for (int w4 = 0; w4 < w_times + 1; w4++) {
          if (h_mid == 0) {
            // First row: the top filter row falls entirely in the padding.
            elewise_res1 = zero;
            elewise_res0 = zero;
            elewise_res2 = zero;
          } else {
            // Top filter row reuses the bottom input row cached last h_mid.
            elewise_res1 = vmulq_n_f32(input_buff_bottom[w4].val[1], w01);
            elewise_res0 = vmulq_n_f32(input_buff_bottom[w4].val[0], w00);
            elewise_res2 = vmulq_n_f32(input_buff_bottom[w4].val[0], w02);
          }
          // vld2q de-interleaves: val[0] = even columns, val[1] = odd columns.
          input_buff_mid = vld2q_f32(input_row_ptr);
          input_buff_bottom[w4] = vld2q_f32(input_row_ptr + in_w);
          elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_mid.val[1], w11);
          elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10);
          elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12);
          elewise_res1 =
              vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21);
          elewise_res0 =
              vmlaq_n_f32(elewise_res0, input_buff_bottom[w4].val[0], w20);
          elewise_res2 =
              vmlaq_n_f32(elewise_res2, input_buff_bottom[w4].val[0], w22);
          // Right-column taps come from the next even lane (vextq shift by 1).
          res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1),
                           vaddq_f32(elewise_res0, elewise_res1));
          // Fused BN: res = res * new_scale + new_bias, then optional ReLU.
          res3 = vmlaq_f32(vnewbias, vnewscale, res3);
          if (if_relu) {
            res3 = vmaxq_f32(res3, zero);
          }
          // 4-lane store advanced by 3: lane 3 is recomputed and overwritten
          // by the next iteration.
          vst1q_f32(output_row_ptr, res3);
          input_row_ptr += 6;
          output_row_ptr += 3;
        }
      }
      // NEON pass over the last output row; the bottom input row is skipped
      // when it lies in the padding (if_pad).
      input_row_ptr = input_data_tmp + 1 + h_mid * 2 * in_w;
      output_row_ptr = output_data_tmp + 1 + h_mid * out_w;
      for (int w4 = 0; w4 < w_times + 1; w4++) {
        elewise_res1 = vmulq_n_f32(input_buff_bottom[w4].val[1], w01);
        elewise_res0 = vmulq_n_f32(input_buff_bottom[w4].val[0], w00);
        elewise_res2 = vmulq_n_f32(input_buff_bottom[w4].val[0], w02);
        input_buff_mid = vld2q_f32(input_row_ptr);
        input_buff_bottom[w4] = vld2q_f32(input_row_ptr + in_w);
        elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_mid.val[1], w11);
        elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10);
        elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12);
        if (!if_pad) {
          elewise_res1 =
              vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21);
          elewise_res0 =
              vmlaq_n_f32(elewise_res0, input_buff_bottom[w4].val[0], w20);
          elewise_res2 =
              vmlaq_n_f32(elewise_res2, input_buff_bottom[w4].val[0], w22);
        }
        res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1),
                         vaddq_f32(elewise_res0, elewise_res1));
        res3 = vmlaq_f32(vnewbias, vnewscale, res3);
        if (if_relu) {
          res3 = vmaxq_f32(res3, zero);
        }
        if ((w4 != w_times)) {
          vst1q_f32(output_row_ptr, res3);
        } else {
          // Ragged tail of the last row: store only the 1 or 2 valid lanes
          // so we do not write past the row end.
          if (out_l - 2 - w_times * 3 == 1) {
            vst1q_lane_f32(output_row_ptr, res3, 0);
          } else if (out_l - 2 - w_times * 3 == 2) {
            vst1q_lane_f32(output_row_ptr, res3, 0);
            vst1q_lane_f32(output_row_ptr + 1, res3, 1);
          }
        }
        input_row_ptr += 6;
        output_row_ptr += 3;
      }
      // Scalar computation of the four corner outputs; (1 - if_pad) zeroes the
      // taps that would read past the input edge when padding is in play.
      output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 +
                           input_const[in_l] * w21 +
                           input_const[in_l + 1] * w22;
      out2in_mid = (out_l - 1) * 2;
      output_data_tmp[out_l - 1] =
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
          w20 * input_const[out2in_mid + in_w - 1] +
          w21 * input_const[out2in_mid + in_w] +
          (1 - if_pad) * (w12 * input_const[out2in_mid + 1] +
                          w22 * input_const[out2in_mid + in_w + 1]);
      out2in_mid = (out_l - 1) * 2 * in_w;
      output_data_tmp[out_l * (out_l - 1)] =
          w01 * input_const[out2in_mid - in_w] +
          w02 * input_const[out2in_mid - in_w + 1] +
          w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] +
          (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] +
                          w22 * input_const[out2in_mid + in_w + 1]);
      out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2;
      output_data_tmp[out_l * out_l - 1] =
          w00 * input_const[out2in_mid - in_w - 1] +
          w01 * input_const[out2in_mid - in_w] +
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
          (1 - if_pad) * (w20 * input_const[out2in_mid + in_w - 1] +
                          w21 * input_const[out2in_mid + in_w] +
                          w02 * input_const[out2in_mid - in_w + 1] +
                          w12 * input_const[out2in_mid + 1] +
                          w22 * input_const[out2in_mid + in_w + 1]);
      // Apply BN scale/shift to the four corners (the NEON path above already
      // fused it for the interior).
      output_data_tmp[0] =
          output_data_tmp[0] * newscale_data[j] + newbias_data[j];
      output_data_tmp[out_l - 1] =
          output_data_tmp[out_l - 1] * newscale_data[j] + newbias_data[j];
      output_data_tmp[out_l * (out_l - 1)] =
          output_data_tmp[out_l * (out_l - 1)] * newscale_data[j] +
          newbias_data[j];
      output_data_tmp[out_l * out_l - 1] =
          output_data_tmp[out_l * out_l - 1] * newscale_data[j] +
          newbias_data[j];
      if (if_relu) {
        output_data_tmp[0] = output_data_tmp[0] < 0 ? 0 : output_data_tmp[0];
        output_data_tmp[out_l - 1] =
            output_data_tmp[out_l - 1] < 0 ? 0 : output_data_tmp[out_l - 1];
        output_data_tmp[out_l * (out_l - 1)] =
            output_data_tmp[out_l * (out_l - 1)] < 0
                ? 0
                : output_data_tmp[out_l * (out_l - 1)];
        output_data_tmp[out_l * out_l - 1] =
            output_data_tmp[out_l * out_l - 1] < 0
                ? 0
                : output_data_tmp[out_l * out_l - 1];
      }
      // Scalar computation of the first and last column of each interior row,
      // with BN and optional ReLU applied inline.
      for (int i = 1; i < out_h - 1; i++) {
        out2in_mid = i * 2 * in_w;
        output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w] +
                                     w02 * input_const[out2in_mid - in_w + 1] +
                                     w11 * input_const[out2in_mid] +
                                     w12 * input_const[out2in_mid + 1] +
                                     w21 * input_const[out2in_mid + in_w] +
                                     w22 * input_const[out2in_mid + in_w + 1];
        out2in_mid = i * 2 * in_w + (out_l - 1) * 2;
        output_data_tmp[i * out_l + out_l - 1] =
            w00 * input_const[out2in_mid - in_w - 1] +
            w01 * input_const[out2in_mid - in_w] +
            w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
            w20 * input_const[out2in_mid + in_w - 1] +
            w21 * input_const[out2in_mid + in_w] +
            (1 - if_pad) * (w02 * input_const[out2in_mid - in_w + 1] +
                            w12 * input_const[out2in_mid + 1] +
                            w22 * input_const[out2in_mid + in_w + 1]);
        output_data_tmp[i * out_l] =
            output_data_tmp[i * out_l] * newscale_data[j] + newbias_data[j];
        output_data_tmp[i * out_l + out_l - 1] =
            output_data_tmp[i * out_l + out_l - 1] * newscale_data[j] +
            newbias_data[j];
        if (if_relu) {
          output_data_tmp[i * out_l] =
              output_data_tmp[i * out_l] < 0 ? 0 : output_data_tmp[i * out_l];
          output_data_tmp[i * out_l + out_l - 1] =
              output_data_tmp[i * out_l + out_l - 1] < 0
                  ? 0
                  : output_data_tmp[i * out_l + out_l - 1];
        }
      }
      // Next channel's 3x3 filter.
      filter_data_tmp += 9;
    }
    input_data += inhxw * c;
    output_data += outhxw * c;
  }
#endif
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......@@ -33,10 +33,16 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
Tensor *output, Tensor *bias, bool if_bias);
void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
Tensor *output, Tensor *bias, bool if_bias,
const Tensor *new_scale,
const Tensor *new_bias, bool if_bn,
bool if_relu);
Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu);
void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu);
void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
Tensor *output, Tensor bias, bool if_bias);
void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......@@ -22,9 +22,14 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
namespace math {
alignas(64) float packedA[MC * KC];
alignas(64) float packedB[KC * NC];
alignas(64) float ab[MR * NR];
int MC = 0;
int KC = 0;
int NC = 0;
float *packedA;
float *packedB;
float *packedC;
float *zero;
// 将A矩阵分块复制到连续内存(ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer) {
......@@ -55,28 +60,39 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
// 将A矩阵分块复制到连续内存(RowMajor)
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
float *buffer) {
int i, j;
const float *Ai, *Ai1, *Ai2, *Ai3;
for (i = 0; i < m - m_tail; i += MR) {
Ai = &A(i, 0);
Ai1 = &A(i + 1, 0);
Ai2 = &A(i + 2, 0);
Ai3 = &A(i + 3, 0);
const float *a0, *a1, *a2, *a3;
for (int i = 0; i < m - m_tail; i += MR) {
a0 = A + i * lda;
a1 = A + (i + 1) * lda;
a2 = A + (i + 2) * lda;
a3 = A + (i + 3) * lda;
for (int j = 0; j < k; ++j) {
*buffer++ = *Ai++;
*buffer++ = *Ai1++;
*buffer++ = *Ai2++;
*buffer++ = *Ai3++;
*buffer++ = *a0++;
*buffer++ = *a1++;
*buffer++ = *a2++;
*buffer++ = *a3++;
}
}
int i = m - m_tail;
a0 = &A(i, 0);
a1 = a0 + lda;
a2 = a0 + 2 * lda;
a3 = a0 + 3 * lda;
if (m_tail != 0) {
for (j = 0; j < k; ++j) {
for (i = m - m_tail; i < m; ++i) {
*buffer++ = A(i, j);
}
for (i = m; i < m + (MR - m_tail); ++i) {
*buffer++ = 0;
}
if (m_tail <= 3) {
a3 = zero;
}
if (m_tail <= 2) {
a2 = zero;
}
if (m_tail <= 1) {
a1 = zero;
}
for (int j = 0; j < k; ++j) {
*buffer++ = *a0++;
*buffer++ = *a1++;
*buffer++ = *a2++;
*buffer++ = *a3++;
}
}
}
......@@ -113,24 +129,24 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
// 将B矩阵分块复制到连续内存(RowMajor)
void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) {
int i, j;
const float *Bij;
for (j = 0; j < n - n_tail; j += NR) {
for (i = 0; i < k; ++i) {
Bij = &B(i, j);
const float *b0;
for (int j = 0; j < n - n_tail; j += NR) {
for (int i = 0; i < k; ++i) {
b0 = &B(i, j);
asm volatile(
"vld1.32 {q0}, [%[Bij]] \n\t"
"vst1.32 {q0}, [%[buffer]]! \n\t"
"pld [%[b0]] \n\t"
"vld1.32 {q0, q1}, [%[b0]] \n\t"
"vst1.32 {q0, q1}, [%[buffer]]! \n\t"
: [buffer] "+r"(buffer)
: [Bij] "r"(Bij)
: "memory", "q0");
: [b0] "r"(b0)
: "memory", "q0", "q0");
}
}
if (n_tail != 0) {
for (i = 0; i < k; ++i) {
Bij = &B(i, n - n_tail);
for (int i = 0; i < k; ++i) {
b0 = &B(i, n - n_tail);
for (int j = n - n_tail; j < n; ++j) {
*buffer++ = *Bij++;
*buffer++ = *b0++;
}
for (int j = n; j < n + (NR - n_tail); ++j) {
*buffer++ = 0;
......@@ -140,118 +156,53 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
}
// 分块矩阵乘法
void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
int first_time) {
int m_block = (m + MR - 1) / MR * MR;
int n_block = (n + NR - 1) / NR * NR;
int m_tail = m % MR;
int n_tail = n % NR;
if (first_time) {
PackMatrixB_(k, n, n_tail, B, ldb, packedB);
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
float beta, float *c, float *C, int ldc, bool relu) {
for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) {
// AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
}
}
PackMatrixA_(m, k, m_tail, A, lda, packedA);
int i, j, mc, nc;
// B 取 4 列, 打包预热
for (j = 0; j < n_block; j += NR) {
nc = (n - j) < NR ? n_tail : NR;
// A 取 4 行,打包预热
for (i = 0; i < m_block; i += MR) {
mc = (m - i) < MR ? m_tail : MR;
AddDot4x4(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta,
&C(i, j), ldc, mc, nc);
}
if (alpha != 1) {
WriteWithAlphaBeta(mc, nc, c, C, ldc);
return;
}
if (beta == 0) {
WriteBasic(mc, nc, c, C, ldc);
return;
}
if (beta == 1 && !relu) {
WriteWithAdd(mc, nc, c, C, ldc);
return;
}
if (beta == 1 && relu) {
WriteWithAddRelu(mc, nc, c, C, ldc);
return;
}
}
// 分块矩阵乘法
void InnerKernel_relu(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
int first_time, bool relu = false) {
int m_block = (m + MR - 1) / MR * MR;
int n_block = (n + NR - 1) / NR * NR;
int m_tail = m % MR;
int n_tail = n % NR;
if (first_time) {
PackMatrixB_(k, n, n_tail, B, ldb, packedB);
}
PackMatrixA_(m, k, m_tail, A, lda, packedA);
int i, j, mc, nc;
// B 取 4 列, 打包预热
for (j = 0; j < n_block; j += NR) {
nc = (n - j) < NR ? n_tail : NR;
// A 取 4 行,打包预热
for (i = 0; i < m_block; i += MR) {
mc = (m - i) < MR ? m_tail : MR;
AddDot4x4_relu(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta,
&C(i, j), ldc, mc, nc, relu);
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, int ldc,
bool relu, float *new_scale, float *new_bias) {
for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) {
// AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
}
}
}
// 计算一个更小的 4 * 4 的 C 矩阵分块
#if defined(IOS)
void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc) {
// init C
float32x4_t cv0 = vdupq_n_f32(0.0);
float32x4_t cv1 = vdupq_n_f32(0.0);
float32x4_t cv2 = vdupq_n_f32(0.0);
float32x4_t cv3 = vdupq_n_f32(0.0);
float32x4_t av;
float32x4_t bv;
float32x2_t av01;
float32x2_t av23;
for (int p = 0; p < k; p += 1) {
av = vld1q_f32(a);
bv = vld1q_f32(b);
av01 = vget_low_f32(av);
cv0 = vmlaq_lane_f32(cv0, bv, av01, 0);
cv1 = vmlaq_lane_f32(cv1, bv, av01, 1);
av23 = vget_high_f32(av);
cv2 = vmlaq_lane_f32(cv2, bv, av23, 0);
cv3 = vmlaq_lane_f32(cv3, bv, av23, 1);
a += MR;
b += NR;
}
float32x4x4_t cv = {cv0, cv1, cv2, cv3};
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
C(i, j) = 0.0;
} else if (beta != 1.0) {
C(i, j) *= beta;
}
if (j == 0) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0);
} else if (j == 1) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1);
} else if (j == 2) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2);
} else if (j == 3) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3);
}
}
if (relu) {
WriteWithBnRelu(mc, nc, c, C, ldc, new_scale, new_bias);
} else {
WriteWithBn(mc, nc, c, C, ldc, new_scale, new_bias);
}
}
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc,
bool relu = false) {
#if defined(IOS)
void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) {
// init C
float32x4_t cv0 = vdupq_n_f32(0.0);
float32x4_t cv1 = vdupq_n_f32(0.0);
......@@ -296,30 +247,22 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
} else if (j == 3) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3);
}
if (C(i, j) < 0) {
C(i, j) = 0;
}
}
}
}
} // namespace math
#elif defined(ARMV7)
void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc) {
int kc1 = k / 4, kc2 = k % 4;
int bytes_ldc = 4 * ldc;
int flag_alpha = (alpha == 1.0) ? 1 : 2;
int flag_beta;
if (beta == 0.0) {
flag_beta = 0;
} else if (beta == 1.0) {
flag_beta = 1;
} else {
flag_beta = 2;
}
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
const float *a_ptr, *b_ptr;
a_ptr = a;
b_ptr = b;
int kc1 = k / 4;
int kc2 = k % 4;
int step = 4 * ldc;
asm volatile(
"pld [%[a]] \n\t"
"pld [%[b]] \n\t"
"pld [%[a_ptr]] \n\t"
"pld [%[b_ptr]] \n\t"
"vmov.f32 q10, #0.0 \n\t"
"vmov.f32 q11, #0.0 \n\t"
"vmov.f32 q12, #0.0 \n\t"
......@@ -328,20 +271,10 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
"subs %[kc1], %[kc1], #1 \n\t"
"blt end_kc1_%= \n\t"
"loop_kc1_%=: \n\t"
"pld [%[a], #64] \n\t"
"pld [%[b], #64] \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q2, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q2, d1[1] \n\t"
"vmla.f32 q10, q3, d2[0] \n\t"
"vmla.f32 q11, q3, d2[1] \n\t"
"vmla.f32 q12, q3, d3[0] \n\t"
"vmla.f32 q13, q3, d3[1] \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t"
"pld [%[a_ptr], #64] \n\t"
"pld [%[b_ptr], #64] \n\t"
"vld1.32 {q0, q1}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q2, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
......@@ -350,6 +283,16 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
"vmla.f32 q11, q3, d2[1] \n\t"
"vmla.f32 q12, q3, d3[0] \n\t"
"vmla.f32 q13, q3, d3[1] \n\t"
"vld1.32 {q4, q5}, [%[a_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[b_ptr]]! \n\t"
"vmla.f32 q10, q6, d8[0] \n\t"
"vmla.f32 q11, q6, d8[1] \n\t"
"vmla.f32 q12, q6, d9[0] \n\t"
"vmla.f32 q13, q6, d9[1] \n\t"
"vmla.f32 q10, q7, d10[0] \n\t"
"vmla.f32 q11, q7, d10[1] \n\t"
"vmla.f32 q12, q7, d11[0] \n\t"
"vmla.f32 q13, q7, d11[1] \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"bge loop_kc1_%= \n\t"
"end_kc1_%=: \n\t"
......@@ -357,8 +300,8 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
"subs %[kc2], %[kc2], #1 \n\t"
"blt end_kc2_%= \n\t"
"loop_kc2_%=: \n\t"
"vld1.32 {q0}, [%[a]]! \n\t"
"vld1.32 {q1}, [%[b]]! \n\t"
"vld1.32 {q0}, [%[a_ptr]]! \n\t"
"vld1.32 {q1}, [%[b_ptr]]! \n\t"
"vmla.f32 q10, q1, d0[0] \n\t"
"vmla.f32 q11, q1, d0[1] \n\t"
"vmla.f32 q12, q1, d1[0] \n\t"
......@@ -367,438 +310,389 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
"bge loop_kc2_%= \n\t"
"end_kc2_%=: \n\t"
"cmp %[mc], #4 \n\t"
"bne temp_%= \n\t"
"cmp %[nc], #4 \n\t"
"bne temp_%= \n\t"
"vmov.f32 d8[0], %[alpha] \n\t"
"vmov.f32 d8[1], %[beta] \n\t"
"cmp %[flag_alpha], #1 \n\t"
"bne alpha_%= \n\t"
"alpha_%=: \n\t"
"vmul.f32 q10, q10, d8[0] \n\t"
"vmul.f32 q11, q11, d8[0] \n\t"
"vmul.f32 q12, q12, d8[0] \n\t"
"vmul.f32 q13, q13, d8[0] \n\t"
"beta_%=: \n\t"
"cmp %[flag_beta], #0 \n\t"
"beq memory_%= \n\t"
"mov r4, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"vld1.32 {q0}, [r4], r6 \n\t"
"vld1.32 {q1}, [r4], r6 \n\t"
"vld1.32 {q2}, [r4], r6 \n\t"
"vld1.32 {q3}, [r4] \n\t"
"cmp %[flag_beta], #1 \n\t"
"beq beta_eq1_%= \n\t"
"bne beta_ne1_%= \n\t"
"beta_eq1_%=: \n\t"
"vadd.f32 q10, q10, q0 \n\t"
"vadd.f32 q11, q11, q1 \n\t"
"vadd.f32 q12, q12, q2 \n\t"
"vadd.f32 q13, q13, q3 \n\t"
"b memory_%= \n\t"
"beta_ne1_%=: \n\t"
"vmla.f32 q10, q0, d8[1] \n\t"
"vmla.f32 q11, q1, d8[1] \n\t"
"vmla.f32 q12, q2, d8[1] \n\t"
"vmla.f32 q13, q3, d8[1] \n\t"
"memory_%=: \n\t"
"mov r5, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"mov r5, %[c] \n\t"
"mov r6, %[step] \n\t"
"vst1.32 {q10}, [r5], r6 \n\t"
"vst1.32 {q11}, [r5], r6 \n\t"
"vst1.32 {q12}, [r5], r6 \n\t"
"vst1.32 {q13}, [r5] \n\t"
"b end_%= \n\t"
"temp_%=: \n\t"
"vst1.32 {q10, q11}, [%[ab]]!\n\t"
"vst1.32 {q12, q13}, [%[ab]] \n\t"
"end_%=: \n\t"
:
: [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1),
[kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha),
[beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc),
[flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta)
: "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13");
if (mc != MR || nc != NR) {
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
if (alpha != 1.0) {
C(i, j) = alpha * ab[i * MR + j];
} else {
C(i, j) = ab[i * MR + j];
}
} else {
if (beta != 1.0) {
C(i, j) *= beta;
}
if (alpha != 1.0) {
C(i, j) += alpha * ab[i * MR + j];
} else {
C(i, j) += ab[i * MR + j];
}
}
}
}
: [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
[kc2] "r"(kc2), [step] "r"(step)
: "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q10", "q11", "q12", "q13");
}
#else
// Portable scalar fallback micro-kernel (no NEON / ARMV7): accumulates the
// product of a packed 4-row panel `a` (4 floats per step of k) and a packed
// 4-column panel `b` (4 floats per step of k) into the 4x4 tile `c`, whose
// rows are `ldc` floats apart.
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
  for (int p = 0; p < k; ++p) {
    // Rank-1 update: row i of the tile gets a[i] * b[0..3] added to it.
    for (int i = 0; i < 4; ++i) {
      float *c_row = c + i * ldc;
      const float a_i = a[i];
      for (int j = 0; j < 4; ++j) {
        c_row[j] += a_i * b[j];
      }
    }
    // Advance to the next packed column of A and row of B.
    a += 4;
    b += 4;
  }
}
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc,
bool relu = false) {
int kc1 = k / 4, kc2 = k % 4;
int bytes_ldc = 4 * ldc;
int flag_alpha = (alpha == 1.0) ? 1 : 2;
int flag_beta;
if (beta == 0.0) {
flag_beta = 0;
} else if (beta == 1.0) {
flag_beta = 1;
} else {
flag_beta = 2;
#endif
// 32位 float 矩阵乘法
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, bool relu) {
// L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
// L2 cache is 0.5~4 Mib (Contex-A72 cluster)
int L1 = 30 * 1024;
int L2 = 1 * 1024 * 1024;
KC = k;
MC = L2 / (2 * KC * sizeof(float));
NC = MC;
// make sure MC is multiple of 4, and NC is multiple of 8
int mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + 4 - 1) / 4 * 4;
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
int nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + 8 - 1) / 8 * 8;
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
packedC = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
for (int l = 0; l < KC; ++l) {
zero[l] = 0;
}
asm volatile(
"pld [%[a]] \n\t"
"pld [%[b]] \n\t"
"vmov.f32 q10, #0.0 \n\t"
"vmov.f32 q11, #0.0 \n\t"
"vmov.f32 q12, #0.0 \n\t"
"vmov.f32 q13, #0.0 \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"blt end_kc1_%= \n\t"
"loop_kc1_%=: \n\t"
"pld [%[a], #64] \n\t"
"pld [%[b], #64] \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q2, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q2, d1[1] \n\t"
"vmla.f32 q10, q3, d2[0] \n\t"
"vmla.f32 q11, q3, d2[1] \n\t"
"vmla.f32 q12, q3, d3[0] \n\t"
"vmla.f32 q13, q3, d3[1] \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q2, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q2, d1[1] \n\t"
"vmla.f32 q10, q3, d2[0] \n\t"
"vmla.f32 q11, q3, d2[1] \n\t"
"vmla.f32 q12, q3, d3[0] \n\t"
"vmla.f32 q13, q3, d3[1] \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"bge loop_kc1_%= \n\t"
"end_kc1_%=: \n\t"
int mc, nc;
for (int j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
for (int i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc,
relu);
}
}
"subs %[kc2], %[kc2], #1 \n\t"
"blt end_kc2_%= \n\t"
"loop_kc2_%=: \n\t"
"vld1.32 {q0}, [%[a]]! \n\t"
"vld1.32 {q1}, [%[b]]! \n\t"
"vmla.f32 q10, q1, d0[0] \n\t"
"vmla.f32 q11, q1, d0[1] \n\t"
"vmla.f32 q12, q1, d1[0] \n\t"
"vmla.f32 q13, q1, d1[1] \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"bge loop_kc2_%= \n\t"
"end_kc2_%=: \n\t"
paddle_mobile::memory::Free(packedA);
paddle_mobile::memory::Free(packedB);
paddle_mobile::memory::Free(packedC);
paddle_mobile::memory::Free(zero);
}
"cmp %[mc], #4 \n\t"
"bne temp_%= \n\t"
"cmp %[nc], #4 \n\t"
"bne temp_%= \n\t"
"vmov.f32 d8[0], %[alpha] \n\t"
"vmov.f32 d8[1], %[beta] \n\t"
"cmp %[flag_alpha], #1 \n\t"
"bne alpha_%= \n\t"
"alpha_%=: \n\t"
"vmul.f32 q10, q10, d8[0] \n\t"
"vmul.f32 q11, q11, d8[0] \n\t"
"vmul.f32 q12, q12, d8[0] \n\t"
"vmul.f32 q13, q13, d8[0] \n\t"
"beta_%=: \n\t"
"cmp %[flag_beta], #0 \n\t"
"beq memory_%= \n\t"
"mov r4, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"vld1.32 {q0}, [r4], r6 \n\t"
"vld1.32 {q1}, [r4], r6 \n\t"
"vld1.32 {q2}, [r4], r6 \n\t"
"vld1.32 {q3}, [r4] \n\t"
"cmp %[flag_beta], #1 \n\t"
"beq beta_eq1_%= \n\t"
"bne beta_ne1_%= \n\t"
"beta_eq1_%=: \n\t"
"vadd.f32 q10, q10, q0 \n\t"
"vadd.f32 q11, q11, q1 \n\t"
"vadd.f32 q12, q12, q2 \n\t"
"vadd.f32 q13, q13, q3 \n\t"
"b memory_%= \n\t"
"beta_ne1_%=: \n\t"
"vmla.f32 q10, q0, d8[1] \n\t"
"vmla.f32 q11, q1, d8[1] \n\t"
"vmla.f32 q12, q2, d8[1] \n\t"
"vmla.f32 q13, q3, d8[1] \n\t"
"memory_%=: \n\t"
"vmax.f32 q10, q10, q14 \n\t"
"vmax.f32 q11, q11, q14 \n\t"
"vmax.f32 q12, q12, q14 \n\t"
"vmax.f32 q13, q13, q14 \n\t"
"mov r5, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"vst1.32 {q10}, [r5], r6 \n\t"
"vst1.32 {q11}, [r5], r6 \n\t"
"vst1.32 {q12}, [r5], r6 \n\t"
"vst1.32 {q13}, [r5] \n\t"
"b end_%= \n\t"
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias) {
// L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
// L2 cache is 0.5~4 Mib (Contex-A72 cluster)
int L1 = 30 * 1024;
int L2 = 1 * 1024 * 1024;
KC = k;
MC = L2 / (2 * KC * sizeof(float));
NC = MC;
// make sure MC is multiple of 4, and NC is multiple of 8
int mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + 4 - 1) / 4 * 4;
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
int nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + 8 - 1) / 8 * 8;
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
packedB = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
packedC = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
for (int l = 0; l < KC; ++l) {
zero[l] = 0;
}
"temp_%=: \n\t"
"vst1.32 {q10, q11}, [%[ab]]!\n\t"
"vst1.32 {q12, q13}, [%[ab]] \n\t"
"end_%=: \n\t"
:
: [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1),
[kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha),
[beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc),
[flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta)
: "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13",
"q14");
if (mc != MR || nc != NR) {
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
if (alpha != 1.0) {
C(i, j) = alpha * ab[i * MR + j];
} else {
C(i, j) = ab[i * MR + j];
}
} else {
if (beta != 1.0) {
C(i, j) *= beta;
}
if (alpha != 1.0) {
C(i, j) += alpha * ab[i * MR + j];
} else {
C(i, j) += ab[i * MR + j];
}
}
if (relu) {
if (C(i, j) < 0) {
C(i, j) = 0;
}
}
}
int mc, nc;
for (int j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
for (int i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
&C(i, j), ldc, relu, new_scale + i, new_bias + i);
}
}
paddle_mobile::memory::Free(packedA);
paddle_mobile::memory::Free(packedB);
paddle_mobile::memory::Free(packedC);
paddle_mobile::memory::Free(zero);
}
#else
void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc) {
float c[16] = {0};
float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3;
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu) {
float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
for (int p = 0; p < k; p += 1) {
reg_b0 = *b++;
reg_b1 = *b++;
reg_b2 = *b++;
reg_b3 = *b++;
const float *a0, *b0, *b1, *b2, *b3;
float *c0, *C0;
reg_a0 = *a++;
reg_a1 = *a++;
reg_a2 = *a++;
reg_a3 = *a++;
int volatile kc1 = k / 4;
int volatile kc2 = k % 4;
int volatile nc1 = n / 16;
int _nc1 = n % 16;
int volatile nc2 = _nc1 / 4;
int volatile nc3 = _nc1 % 4;
for (int i = 0; i < kc1; i++) {
a0 = A + i * 4;
b0 = B + i * 4 * ldb;
b1 = b0 + ldb;
b2 = b1 + ldb;
b3 = b2 + ldb;
c0 = bufferC;
asm volatile(
"pld [%[a0], #16] \n\t"
"vld1.32 {q0}, [%[a0]] \n\t"
// first row
c[0] += reg_a0 * reg_b0;
c[1] += reg_a0 * reg_b1;
c[2] += reg_a0 * reg_b2;
c[3] += reg_a0 * reg_b3;
"subs %[nc1], %[nc1], #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
// second row
c[4] += reg_a1 * reg_b0;
c[5] += reg_a1 * reg_b1;
c[6] += reg_a1 * reg_b2;
c[7] += reg_a1 * reg_b3;
"cmp %[i], #0 \n\t"
"beq i_eq0_%= \n\t"
"bne i_ne0_%= \n\t"
// third row
c[8] += reg_a2 * reg_b0;
c[9] += reg_a2 * reg_b1;
c[10] += reg_a2 * reg_b2;
c[11] += reg_a2 * reg_b3;
"i_eq0_%=: \n\t"
"vmov.f32 q10, #0.0 \n\t"
"vmov.f32 q11, #0.0 \n\t"
"vmov.f32 q12, #0.0 \n\t"
"vmov.f32 q13, #0.0 \n\t"
"b gemm_nc1_%= \n\t"
// fourth row
c[12] += reg_a3 * reg_b0;
c[13] += reg_a3 * reg_b1;
c[14] += reg_a3 * reg_b2;
c[15] += reg_a3 * reg_b3;
}
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
C(i, j) = 0.0;
} else if (beta != 1.0) {
C(i, j) *= beta;
}
if (alpha != 1.0) {
C(i, j) += alpha * c[i * MR + j];
"i_ne0_%=: \n\t"
"pld [%[c0], #64] \n\t"
"vld1.32 {q10, q11}, [%[c0]]! \n\t"
"vld1.32 {q12, q13}, [%[c0]] \n\t"
"sub %[c0], %[c0], #32 \n\t"
"gemm_nc1_%=: \n\t"
"pld [%[b0], #64] \n\t"
"vld1.32 {q2, q3}, [%[b0]]! \n\t"
"vld1.32 {q4, q5}, [%[b0]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q3, d0[0] \n\t"
"vmla.f32 q12, q4, d0[0] \n\t"
"vmla.f32 q13, q5, d0[0] \n\t"
"pld [%[b1], #64] \n\t"
"vld1.32 {q2, q3}, [%[b1]]! \n\t"
"vld1.32 {q4, q5}, [%[b1]]! \n\t"
"vmla.f32 q10, q2, d0[1] \n\t"
"vmla.f32 q11, q3, d0[1] \n\t"
"vmla.f32 q12, q4, d0[1] \n\t"
"vmla.f32 q13, q5, d0[1] \n\t"
"pld [%[b2], #64] \n\t"
"vld1.32 {q2, q3}, [%[b2]]! \n\t"
"vld1.32 {q4, q5}, [%[b2]]! \n\t"
"vmla.f32 q10, q2, d1[0] \n\t"
"vmla.f32 q11, q3, d1[0] \n\t"
"vmla.f32 q12, q4, d1[0] \n\t"
"vmla.f32 q13, q5, d1[0] \n\t"
"pld [%[b3], #64] \n\t"
"vld1.32 {q2, q3}, [%[b3]]! \n\t"
"vld1.32 {q4, q5}, [%[b3]]! \n\t"
"vmla.f32 q10, q2, d1[1] \n\t"
"vmla.f32 q11, q3, d1[1] \n\t"
"vmla.f32 q12, q4, d1[1] \n\t"
"vmla.f32 q13, q5, d1[1] \n\t"
"vst1.32 {q10, q11}, [%[c0]]! \n\t"
"vst1.32 {q12, q13}, [%[c0]]! \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"blt end_nc2_%= \n\t"
"loop_nc2_%=: \n\t"
"cmp %[i], #0 \n\t"
"beq ii_eq0_%= \n\t"
"bne ii_ne0_%= \n\t"
"ii_eq0_%=: \n\t"
"vmov.f32 q10, #0.0 \n\t"
"b gemm_nc2_%= \n\t"
"ii_ne0_%=: \n\t"
"pld [%[c0], #16] \n\t"
"vld1.32 {q10}, [%[c0]] \n\t"
"gemm_nc2_%=: \n\t"
"pld [%[b0], #16] \n\t"
"vld1.32 {q2}, [%[b0]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"pld [%[b1], #16] \n\t"
"vld1.32 {q3}, [%[b1]]! \n\t"
"vmla.f32 q10, q3, d0[1] \n\t"
"pld [%[b2], #16] \n\t"
"vld1.32 {q4}, [%[b2]]! \n\t"
"vmla.f32 q10, q4, d1[0] \n\t"
"pld [%[b3], #16] \n\t"
"vld1.32 {q5}, [%[b3]]! \n\t"
"vmla.f32 q10, q5, d1[1] \n\t"
"vst1.32 {q10}, [%[c0]]! \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"bge loop_nc2_%= \n\t"
"end_nc2_%=: \n\t"
: [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3),
[c0] "+r"(c0)
: [a0] "r"(a0), [i] "r"(i), [nc1] "r"(nc1), [nc2] "r"(nc2)
: "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13");
for (int j = 0; j < nc3; j++) {
if (i == 0) {
*c0 = (*a0) * (*b0++);
} else {
C(i, j) += c[i * MR + j];
*c0 += (*a0) * (*b0++);
}
*c0 += (*(a0 + 1)) * (*b1++);
*c0 += (*(a0 + 2)) * (*b2++);
*c0 += (*(a0 + 3)) * (*b3++);
c0++;
}
}
}
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc,
bool relu) {
float c[16] = {0};
float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3;
for (int i = 0; i < kc2; ++i) {
a0 = A + 4 * kc1 + i;
b0 = B + (4 * kc1 + i) * ldb;
c0 = bufferC;
asm volatile(
"pld [%[a0], #16] \n\t"
"vld1.32 {d0}, [%[a0]] \n\t"
for (int p = 0; p < k; p += 1) {
reg_b0 = *b++;
reg_b1 = *b++;
reg_b2 = *b++;
reg_b3 = *b++;
"subs %[nc1], %[nc1], #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
reg_a0 = *a++;
reg_a1 = *a++;
reg_a2 = *a++;
reg_a3 = *a++;
"pld [%[c0], #64] \n\t"
"vld1.32 {q10, q11}, [%[c0]]! \n\t"
"vld1.32 {q12, q13}, [%[c0]] \n\t"
"sub %[c0], %[c0], #32 \n\t"
// first row
c[0] += reg_a0 * reg_b0;
c[1] += reg_a0 * reg_b1;
c[2] += reg_a0 * reg_b2;
c[3] += reg_a0 * reg_b3;
"gemm_nc1_%=: \n\t"
"pld [%[b0], #64] \n\t"
"vld1.32 {q2, q3}, [%[b0]]! \n\t"
"vld1.32 {q4, q5}, [%[b0]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q3, d0[0] \n\t"
"vmla.f32 q12, q4, d0[0] \n\t"
"vmla.f32 q13, q5, d0[0] \n\t"
// second row
c[4] += reg_a1 * reg_b0;
c[5] += reg_a1 * reg_b1;
c[6] += reg_a1 * reg_b2;
c[7] += reg_a1 * reg_b3;
"vst1.32 {q10, q11}, [%[c0]]! \n\t"
"vst1.32 {q12, q13}, [%[c0]]! \n\t"
// third row
c[8] += reg_a2 * reg_b0;
c[9] += reg_a2 * reg_b1;
c[10] += reg_a2 * reg_b2;
c[11] += reg_a2 * reg_b3;
"subs %[nc1], %[nc1], #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
// fourth row
c[12] += reg_a3 * reg_b0;
c[13] += reg_a3 * reg_b1;
c[14] += reg_a3 * reg_b2;
c[15] += reg_a3 * reg_b3;
}
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
C(i, j) = 0.0;
} else if (beta != 1.0) {
C(i, j) *= beta;
}
if (alpha != 1.0) {
C(i, j) += alpha * c[i * MR + j];
} else {
C(i, j) += c[i * MR + j];
}
if (relu) {
if (C(i, j) < 0) {
C(i, j) = 0;
}
}
"subs %[nc2], %[nc2], #1 \n\t"
"blt end_nc2_%= \n\t"
"loop_nc2_%=: \n\t"
"pld [%[c0], #16] \n\t"
"vld1.32 {q10}, [%[c0]] \n\t"
"gemm_nc2_%=: \n\t"
"vld1.32 {q2}, [%[b0]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vst1.32 {q10}, [%[c0]]! \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"bge loop_nc2_%= \n\t"
"end_nc2_%=: \n\t"
: [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3),
[c0] "+r"(c0)
: [a0] "r"(a0), [nc1] "r"(nc1), [nc2] "r"(nc2)
: "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13");
for (int j = 0; j < nc3; j++) {
*c0 += (*a0) * (*b0++);
c0++;
}
}
}
#endif
// 32位 float 矩阵乘法
void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc) {
int i, j, p, mc, nc, kc;
float beta_;
if (m == 1) {
VectorKernel(1, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
if (alpha != 1) {
VecWriteWithAlphaBeta(n, bufferC, C, ldc);
return;
}
for (j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
for (p = 0; p < k; p += KC) {
kc = s_min(k - p, KC);
for (i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
if (p != 0) {
beta_ = 1.0;
} else {
beta_ = beta;
}
InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_,
&C(i, j), ldc, i == 0);
}
}
if (beta == 0) {
VecWriteBasic(n, bufferC, C, ldc);
return;
}
}
void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc) {
int i, j, p, mc, nc, kc;
float beta_;
for (j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
for (p = 0; p < k; p += KC) {
kc = s_min(k - p, KC);
for (i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
if (p != 0) {
beta_ = 1.0;
} else {
beta_ = beta;
}
if (p + KC >= k) {
InnerKernel_relu(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb,
beta_, &C(i, j), ldc, i == 0, true);
} else {
InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_,
&C(i, j), ldc, i == 0);
}
}
}
if (beta == 1 && !relu) {
VecWriteWithAdd(n, bufferC, C, ldc);
return;
}
if (beta == 1 && relu) {
VecWriteWithAddRelu(n, bufferC, C, ldc);
return;
}
}
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc) {
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu, float *new_scale, float *new_bias) {
float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
const float *a0, *b0, *b1, *b2, *b3;
......@@ -1000,17 +894,773 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
}
}
c0 = bufferC;
C0 = C;
for (int i = 0; i < n; i++) {
if (beta == 1.0) {
*C0++ += *c0++;
} else {
*C0++ = *c0++;
if (relu) {
VecWriteWithBnRelu(n, bufferC, C, ldc, new_scale, new_bias);
} else {
VecWriteWithBn(n, bufferC, C, ldc, new_scale, new_bias);
}
}
// 4x8 micro-kernel: accumulates, over depth k, the outer product of a packed
// 4-row strip of A (`a`) and a packed 8-column strip of B (`b`) into the 4x8
// output tile at `c` (row stride ldc floats). Assumes `a` is packed 4 floats
// per depth step and `b` 8 floats per depth step — TODO confirm against the
// PackMatrixA_/PackMatrixB_ layout. The 4x8 accumulator lives in q8..q15
// (two q registers per output row); the tile is overwritten, not added to.
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
  const float *a_ptr, *b_ptr;
  a_ptr = a;
  b_ptr = b;
  // Main loop (loop_kc1) handles four depth steps per iteration (the body is
  // written out twice, each half covering two steps); loop_kc2 drains the
  // remaining k % 4 steps one at a time.
  int kc1 = k / 4;
  int kc2 = k % 4;
  // Byte distance between consecutive rows of c (4 bytes per float).
  int step = 4 * ldc;
  asm volatile(
      // Prefetch the first cache lines of both packed operands.
      "pld [%[a_ptr]] \n\t"
      "pld [%[b_ptr]] \n\t"
      // Zero the 4x8 accumulator tile (row i of C is {q(8+2i), q(9+2i)}).
      "vmov.f32 q8, #0.0 \n\t"
      "vmov.f32 q9, #0.0 \n\t"
      "vmov.f32 q10, #0.0 \n\t"
      "vmov.f32 q11, #0.0 \n\t"
      "vmov.f32 q12, #0.0 \n\t"
      "vmov.f32 q13, #0.0 \n\t"
      "vmov.f32 q14, #0.0 \n\t"
      "vmov.f32 q15, #0.0 \n\t"
      "subs %[kc1], %[kc1], #1 \n\t"
      "blt end_kc1_%= \n\t"
      "loop_kc1_%=: \n\t"
      "pld [%[a_ptr], #64] \n\t"
      "pld [%[b_ptr], #64] \n\t"
      // q0/q1 = a for two depth steps; q2..q5 = matching b rows (8 floats each).
      "vld1.32 {q0, q1}, [%[a_ptr]]! \n\t"
      "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
      "vld1.32 {q4, q5}, [%[b_ptr]]! \n\t"
      // Depth step p: broadcast each a lane (d0/d1) against b row {q2,q3}.
      "vmla.f32 q8, q2, d0[0] \n\t"
      "vmla.f32 q9, q3, d0[0] \n\t"
      "vmla.f32 q10, q2, d0[1] \n\t"
      "vmla.f32 q11, q3, d0[1] \n\t"
      "vmla.f32 q12, q2, d1[0] \n\t"
      "vmla.f32 q13, q3, d1[0] \n\t"
      "vmla.f32 q14, q2, d1[1] \n\t"
      "vmla.f32 q15, q3, d1[1] \n\t"
      // Depth step p+1: a lanes from q1 (d2/d3) against b row {q4,q5}.
      "vmla.f32 q8, q4, d2[0] \n\t"
      "vmla.f32 q9, q5, d2[0] \n\t"
      "vmla.f32 q10, q4, d2[1] \n\t"
      "vmla.f32 q11, q5, d2[1] \n\t"
      "vmla.f32 q12, q4, d3[0] \n\t"
      "vmla.f32 q13, q5, d3[0] \n\t"
      "vmla.f32 q14, q4, d3[1] \n\t"
      "vmla.f32 q15, q5, d3[1] \n\t"
      // Second unrolled half: depth steps p+2 and p+3, identical pattern.
      "pld [%[b_ptr], #64] \n\t"
      "vld1.32 {q0, q1}, [%[a_ptr]]! \n\t"
      "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
      "vld1.32 {q4, q5}, [%[b_ptr]]! \n\t"
      "vmla.f32 q8, q2, d0[0] \n\t"
      "vmla.f32 q9, q3, d0[0] \n\t"
      "vmla.f32 q10, q2, d0[1] \n\t"
      "vmla.f32 q11, q3, d0[1] \n\t"
      "vmla.f32 q12, q2, d1[0] \n\t"
      "vmla.f32 q13, q3, d1[0] \n\t"
      "vmla.f32 q14, q2, d1[1] \n\t"
      "vmla.f32 q15, q3, d1[1] \n\t"
      "vmla.f32 q8, q4, d2[0] \n\t"
      "vmla.f32 q9, q5, d2[0] \n\t"
      "vmla.f32 q10, q4, d2[1] \n\t"
      "vmla.f32 q11, q5, d2[1] \n\t"
      "vmla.f32 q12, q4, d3[0] \n\t"
      "vmla.f32 q13, q5, d3[0] \n\t"
      "vmla.f32 q14, q4, d3[1] \n\t"
      "vmla.f32 q15, q5, d3[1] \n\t"
      "subs %[kc1], %[kc1], #1 \n\t"
      "bge loop_kc1_%= \n\t"
      "end_kc1_%=: \n\t"
      // Remainder loop: one depth step per iteration.
      "subs %[kc2], %[kc2], #1 \n\t"
      "blt end_kc2_%= \n\t"
      "loop_kc2_%=: \n\t"
      "vld1.32 {q0}, [%[a_ptr]]! \n\t"
      "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
      "vmla.f32 q8, q2, d0[0] \n\t"
      "vmla.f32 q9, q3, d0[0] \n\t"
      "vmla.f32 q10, q2, d0[1] \n\t"
      "vmla.f32 q11, q3, d0[1] \n\t"
      "vmla.f32 q12, q2, d1[0] \n\t"
      "vmla.f32 q13, q3, d1[0] \n\t"
      "vmla.f32 q14, q2, d1[1] \n\t"
      "vmla.f32 q15, q3, d1[1] \n\t"
      "subs %[kc2], %[kc2], #1 \n\t"
      "bge loop_kc2_%= \n\t"
      "end_kc2_%=: \n\t"
      // Store the four tile rows; r6 holds the row byte stride.
      "mov r5, %[c] \n\t"
      "mov r6, %[step] \n\t"
      "vst1.32 {q8, q9}, [r5], r6 \n\t"
      "vst1.32 {q10, q11}, [r5], r6 \n\t"
      "vst1.32 {q12, q13}, [r5], r6 \n\t"
      "vst1.32 {q14, q15}, [r5] \n\t"
      :
      : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
        [kc2] "r"(kc2), [step] "r"(step)
      // NOTE(review): a_ptr/b_ptr/kc1/kc2 are listed as inputs but are
      // modified by the asm (post-increment loads, subs) — this relies on
      // the compiler not reusing those registers afterwards; strictly they
      // should be "+r" outputs. Flagged, not changed here.
      : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9",
        "q10", "q11", "q12", "q13", "q14", "q15");
}
// C = A * B
// Copies the mc x nc block result from the packed buffer `c` (row stride NC
// floats — TODO confirm NC is the packing stride) into the output matrix `C`
// (row stride ldc floats), overwriting C. Whole 16-float (4 quadword) chunks
// are copied with NEON; the nc % 16 tail is copied scalar per row.
void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
  int nc1 = nc / 16;   // number of full 16-float chunks per row
  int _nc1 = nc % 16;  // leftover columns handled by the scalar loop
  int step = 4 * ldc;               // byte stride between rows of C
  int step1 = 4 * (NC - 16 * nc1);  // bytes to skip in c after the vector part
  int volatile m = mc;
  float *volatile c_ptr, *volatile C_ptr;
  float *C0, *c0;
  c_ptr = c;
  C_ptr = C;
  if (nc1 > 0) {
    asm volatile(
        // Outer loop over rows (mc), inner loop over 16-float chunks (nc1).
        "subs %[mc], %[mc], #1 \n\t"
        "blt end_mc_%= \n\t"
        "loop_mc_%=: \n\t"
        "mov r6, %[C_ptr] \n\t"
        "mov r5, %[nc1] \n\t"
        "subs r5, r5, #1 \n\t"
        "blt end_nc1_%= \n\t"
        "loop_nc1_%=: \n\t"
        "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t"
        "vst1.32 {q0, q1}, [r6]! \n\t"
        "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t"
        "vst1.32 {q2, q3}, [r6]! \n\t"
        "subs r5, r5, #1 \n\t"
        "bge loop_nc1_%= \n\t"
        "end_nc1_%=: \n\t"
        // Advance both pointers to the start of the next row.
        "add %[C_ptr], %[C_ptr], %[step] \n\t"
        "add %[c_ptr], %[c_ptr], %[step1] \n\t"
        "subs %[mc], %[mc], #1 \n\t"
        "bge loop_mc_%= \n\t"
        "end_mc_%=: \n\t"
        :
        : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1),
          [step] "r"(step), [step1] "r"(step1)
        : "memory", "r5", "r6", "q0", "q1", "q2", "q3");
  }
  // Scalar copy of the nc % 16 tail columns of every row.
  if (_nc1 != 0) {
    for (int i = 0; i < mc; i++) {
      C0 = C_ptr + nc1 * 16 + i * ldc;
      c0 = c_ptr + nc1 * 16 + i * NC;
      for (int j = 0; j < _nc1; j++) {
        *C0++ = *c0++;
      }
    }
  }
}
// C = alpha * A * B + beta * C
// NOTE(review): intentionally left as an empty stub in this revision — the
// general alpha/beta write-back path is not implemented, so callers reaching
// this branch get no output written. TODO: implement or assert unreachable.
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
// C = A * B + C
// Adds the mc x nc block result in the packed buffer `c` (row stride NC
// floats — TODO confirm) into the output matrix `C` (row stride ldc floats).
// 16-float chunks are accumulated with NEON; the nc % 16 tail is scalar.
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
  int nc1 = nc / 16;   // full 16-float chunks per row
  int _nc1 = nc % 16;  // scalar tail columns
  int step = 4 * ldc;               // byte stride between rows of C
  int step1 = 4 * (NC - 16 * nc1);  // bytes to skip in c after the vector part
  int volatile m = mc;
  float *volatile c_ptr, *volatile C_ptr;
  float *C0, *c0;
  c_ptr = c;
  C_ptr = C;
  if (nc1 > 0) {
    asm volatile(
        "subs %[mc], %[mc], #1 \n\t"
        "blt end_mc_%= \n\t"
        "loop_mc_%=: \n\t"
        "mov r6, %[C_ptr] \n\t"
        "mov r5, %[nc1] \n\t"
        "subs r5, r5, #1 \n\t"
        "blt end_nc1_%= \n\t"
        "loop_nc1_%=: \n\t"
        // Load existing C, load buffer c, add, store back (8 floats)...
        "vld1.32 {q0, q1}, [r6] \n\t"
        "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t"
        "vadd.f32 q10, q0, q2 \n\t"
        "vadd.f32 q11, q1, q3 \n\t"
        "vst1.32 {q10, q11}, [r6]! \n\t"
        // ...then the second 8 floats of the 16-float chunk.
        "vld1.32 {q4, q5}, [r6] \n\t"
        "vld1.32 {q6, q7}, [%[c_ptr]]! \n\t"
        "vadd.f32 q12, q4, q6 \n\t"
        "vadd.f32 q13, q5, q7 \n\t"
        "vst1.32 {q12, q13}, [r6]! \n\t"
        "subs r5, r5, #1 \n\t"
        "bge loop_nc1_%= \n\t"
        "end_nc1_%=: \n\t"
        "add %[C_ptr], %[C_ptr], %[step] \n\t"
        "add %[c_ptr], %[c_ptr], %[step1] \n\t"
        "subs %[mc], %[mc], #1 \n\t"
        "bge loop_mc_%= \n\t"
        "end_mc_%=: \n\t"
        :
        : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1),
          [step] "r"(step), [step1] "r"(step1)
        : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
          "q10", "q11", "q12", "q13");
  }
  // Scalar accumulation for the nc % 16 tail columns of every row.
  if (_nc1 != 0) {
    for (int i = 0; i < mc; i++) {
      C0 = C_ptr + nc1 * 16 + i * ldc;
      c0 = c_ptr + nc1 * 16 + i * NC;
      for (int j = 0; j < _nc1; j++) {
        *C0++ += *c0++;
      }
    }
  }
}
} // namespace math
// C = A * B + C, relu(C)
// Same accumulation as WriteWithAdd, but each sum is clamped at zero
// (ReLU via vmax against q14 == 0) before being stored back into C.
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
  int nc1 = nc / 16;   // full 16-float chunks per row
  int _nc1 = nc % 16;  // scalar tail columns
  int step = 4 * ldc;               // byte stride between rows of C
  int step1 = 4 * (NC - 16 * nc1);  // bytes to skip in c after the vector part
  int volatile m = mc;
  float *volatile c_ptr, *volatile C_ptr;
  float *C0, *c0;
  c_ptr = c;
  C_ptr = C;
  if (nc1 > 0) {
    asm volatile(
        // q14 holds the zero vector used for the ReLU clamp.
        "vmov.f32 q14, #0.0 \n\t"
        "subs %[mc], %[mc], #1 \n\t"
        "blt end_mc_%= \n\t"
        "loop_mc_%=: \n\t"
        "mov r6, %[C_ptr] \n\t"
        "mov r5, %[nc1] \n\t"
        "subs r5, r5, #1 \n\t"
        "blt end_nc1_%= \n\t"
        "loop_nc1_%=: \n\t"
        "vld1.32 {q0, q1}, [r6] \n\t"
        "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t"
        "vadd.f32 q10, q0, q2 \n\t"
        "vadd.f32 q11, q1, q3 \n\t"
        "vmax.f32 q10, q10, q14 \n\t"
        "vmax.f32 q11, q11, q14 \n\t"
        "vst1.32 {q10, q11}, [r6]! \n\t"
        "vld1.32 {q4, q5}, [r6] \n\t"
        "vld1.32 {q6, q7}, [%[c_ptr]]! \n\t"
        "vadd.f32 q12, q4, q6 \n\t"
        "vadd.f32 q13, q5, q7 \n\t"
        "vmax.f32 q12, q12, q14 \n\t"
        "vmax.f32 q13, q13, q14 \n\t"
        "vst1.32 {q12, q13}, [r6]! \n\t"
        "subs r5, r5, #1 \n\t"
        "bge loop_nc1_%= \n\t"
        "end_nc1_%=: \n\t"
        "add %[C_ptr], %[C_ptr], %[step] \n\t"
        "add %[c_ptr], %[c_ptr], %[step1] \n\t"
        "subs %[mc], %[mc], #1 \n\t"
        "bge loop_mc_%= \n\t"
        "end_mc_%=: \n\t"
        :
        : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1),
          [step] "r"(step), [step1] "r"(step1)
        // NOTE(review): q14 is written by the asm but missing from this
        // clobber list — flagged for follow-up, not changed here.
        : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
          "q10", "q11", "q12", "q13");
  }
  // Scalar add + ReLU for the nc % 16 tail columns of every row.
  if (_nc1 != 0) {
    for (int i = 0; i < mc; i++) {
      C0 = C_ptr + nc1 * 16 + i * ldc;
      c0 = c_ptr + nc1 * 16 + i * NC;
      for (int j = 0; j < _nc1; j++) {
        *C0 += *c0;
        if (*C0 < 0) {
          *C0 = 0;
        }
        C0++;
        c0++;
      }
    }
  }
}
// C = A * B, batchnorm(C)
// Writes the mc x nc block from buffer `c` (row stride NC floats — TODO
// confirm) into `C`, applying a per-row affine transform:
// C[i][j] = c[i][j] * scale[i] + bias[i]. The scale/bias value for a row is
// broadcast from d0[0]/d1[0] into q1/q2, and the pointers advance 4 bytes
// (one float) per row.
void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale,
                 float *bias) {
  int volatile nc1 = nc / 16;               // full 16-float chunks per row
  int _nc1 = nc % 16;
  int volatile nc2 = _nc1 / 4;              // remaining full 4-float quads
  // nc3: byte rewind used to handle the last partial quad by re-processing
  // an overlapping, aligned-to-end quad; when _nc1 % 4 == 0, nc3 == 16 and
  // the "cmp #16 / beq" below skips the overlap step entirely.
  int volatile nc3 = 16 - 4 * (_nc1 % 4);
  int volatile step = 4 * (ldc - nc);       // bytes from row end to next row in C
  int volatile step1 = 4 * (NC - nc);       // bytes from row end to next row in c
  asm volatile(
      "subs %[mc], %[mc], #1 \n\t"
      "blt end_mc_%= \n\t"
      "loop_mc_%=: \n\t"
      "mov r5, %[nc1] \n\t"
      "mov r6, %[nc2] \n\t"
      // Broadcast this row's scale (q1) and bias (q2).
      "vld1.32 {d0}, [%[scale]] \n\t"
      "vld1.32 {d1}, [%[bias]] \n\t"
      "vdup.32 q1, d0[0] \n\t"
      "vdup.32 q2, d1[0] \n\t"
      "subs r5, r5, #1 \n\t"
      "blt end_nc1_%= \n\t"
      // 16 floats per iteration: (c * scale) + bias, stored to C.
      "loop_nc1_%=: \n\t"
      "vld1.32 {q3, q4}, [%[c]]! \n\t"
      "vmul.f32 q10, q3, q1 \n\t"
      "vmul.f32 q11, q4, q1 \n\t"
      "vadd.f32 q10, q10, q2 \n\t"
      "vadd.f32 q11, q11, q2 \n\t"
      "vst1.32 {q10, q11}, [%[C]]! \n\t"
      "vld1.32 {q5, q6}, [%[c]]! \n\t"
      "vmul.f32 q12, q5, q1 \n\t"
      "vmul.f32 q13, q6, q1 \n\t"
      "vadd.f32 q12, q12, q2 \n\t"
      "vadd.f32 q13, q13, q2 \n\t"
      "vst1.32 {q12, q13}, [%[C]]! \n\t"
      "subs r5, r5, #1 \n\t"
      "bge loop_nc1_%= \n\t"
      "end_nc1_%=: \n\t"
      // 4 floats per iteration for the remaining full quads.
      "subs r6, r6, #1 \n\t"
      "blt end_nc2_%= \n\t"
      "loop_nc2_%=: \n\t"
      "vld1.32 {q7}, [%[c]]! \n\t"
      "vmul.f32 q10, q7, q1 \n\t"
      "vadd.f32 q10, q10, q2 \n\t"
      "vst1.32 {q10}, [%[C]]! \n\t"
      "subs r6, r6, #1 \n\t"
      "bge loop_nc2_%= \n\t"
      "end_nc2_%=: \n\t"
      // Partial-quad tail: back both pointers up by nc3 bytes and redo one
      // overlapping quad so the row end is covered.
      "cmp %[nc3], #16 \n\t"
      "beq end_nc3_%= \n\t"
      "sub %[c], %[c], %[nc3] \n\t"
      "sub %[C], %[C], %[nc3] \n\t"
      "vld1.32 {q8}, [%[c]]! \n\t"
      "vmul.f32 q11, q8, q1 \n\t"
      "vadd.f32 q11, q11, q2 \n\t"
      "vst1.32 {q11}, [%[C]]! \n\t"
      "end_nc3_%=: \n\t"
      // Next row: advance scale/bias by one float, skip row padding.
      "add %[scale], %[scale], #4 \n\t"
      "add %[bias], %[bias], #4 \n\t"
      "add %[c], %[c], %[step1] \n\t"
      "add %[C], %[C], %[step] \n\t"
      "subs %[mc], %[mc], #1 \n\t"
      "bge loop_mc_%= \n\t"
      "end_mc_%=: \n\t"
      :
      // NOTE(review): c/C/mc/scale/bias are input-only operands yet are
      // modified inside the asm — strictly they should be "+r" outputs.
      // Flagged, not changed here to avoid perturbing register allocation.
      : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2),
        [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1),
        [scale] "r"(scale), [bias] "r"(bias)
      : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q10", "q11", "q12", "q13");
}
// C = A * B, batchnorm(C), relu(C)
// Same per-row affine write-back as WriteWithBn
// (C[i][j] = c[i][j] * scale[i] + bias[i]), with a ReLU clamp (vmax against
// q14 == 0) applied before each store.
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
                     float *bias) {
  int nc1 = nc / 16;   // full 16-float chunks per row
  int _nc1 = nc % 16;
  int nc2 = _nc1 / 4;  // remaining full 4-float quads
  // Byte rewind for the overlapping partial-quad tail; 16 means "no tail".
  int nc3 = 16 - 4 * (_nc1 % 4);
  int step = 4 * (ldc - nc);   // bytes from row end to next row in C
  int step1 = 4 * (NC - nc);   // bytes from row end to next row in c
  asm volatile(
      // q14 holds the zero vector used for the ReLU clamp.
      "vmov.f32 q14, #0.0 \n\t"
      "subs %[mc], %[mc], #1 \n\t"
      "blt end_mc_%= \n\t"
      "loop_mc_%=: \n\t"
      "mov r5, %[nc1] \n\t"
      "mov r6, %[nc2] \n\t"
      // Broadcast this row's scale (q1) and bias (q2).
      "vld1.32 {d0}, [%[scale]] \n\t"
      "vld1.32 {d1}, [%[bias]] \n\t"
      "vdup.32 q1, d0[0] \n\t"
      "vdup.32 q2, d1[0] \n\t"
      "subs r5, r5, #1 \n\t"
      "blt end_nc1_%= \n\t"
      // 16 floats per iteration: max(c * scale + bias, 0) -> C.
      "loop_nc1_%=: \n\t"
      "vld1.32 {q3, q4}, [%[c]]! \n\t"
      "vmul.f32 q10, q3, q1 \n\t"
      "vmul.f32 q11, q4, q1 \n\t"
      "vadd.f32 q10, q10, q2 \n\t"
      "vadd.f32 q11, q11, q2 \n\t"
      "vmax.f32 q10, q10, q14 \n\t"
      "vmax.f32 q11, q11, q14 \n\t"
      "vst1.32 {q10, q11}, [%[C]]! \n\t"
      "vld1.32 {q5, q6}, [%[c]]! \n\t"
      "vmul.f32 q12, q5, q1 \n\t"
      "vmul.f32 q13, q6, q1 \n\t"
      "vadd.f32 q12, q12, q2 \n\t"
      "vadd.f32 q13, q13, q2 \n\t"
      "vmax.f32 q12, q12, q14 \n\t"
      "vmax.f32 q13, q13, q14 \n\t"
      "vst1.32 {q12, q13}, [%[C]]! \n\t"
      "subs r5, r5, #1 \n\t"
      "bge loop_nc1_%= \n\t"
      "end_nc1_%=: \n\t"
      // 4 floats per iteration for the remaining full quads.
      "subs r6, r6, #1 \n\t"
      "blt end_nc2_%= \n\t"
      "loop_nc2_%=: \n\t"
      "vld1.32 {q7}, [%[c]]! \n\t"
      "vmul.f32 q10, q7, q1 \n\t"
      "vadd.f32 q10, q10, q2 \n\t"
      "vmax.f32 q10, q10, q14 \n\t"
      "vst1.32 {q10}, [%[C]]! \n\t"
      "subs r6, r6, #1 \n\t"
      "bge loop_nc2_%= \n\t"
      "end_nc2_%=: \n\t"
      // Partial-quad tail: rewind nc3 bytes and redo one overlapping quad.
      "cmp %[nc3], #16 \n\t"
      "beq end_nc3_%= \n\t"
      "sub %[c], %[c], %[nc3] \n\t"
      "sub %[C], %[C], %[nc3] \n\t"
      "vld1.32 {q8}, [%[c]]! \n\t"
      "vmul.f32 q11, q8, q1 \n\t"
      "vadd.f32 q11, q11, q2 \n\t"
      "vmax.f32 q11, q11, q14 \n\t"
      "vst1.32 {q11}, [%[C]]! \n\t"
      "end_nc3_%=: \n\t"
      // Next row: advance scale/bias by one float, skip row padding.
      "add %[scale], %[scale], #4 \n\t"
      "add %[bias], %[bias], #4 \n\t"
      "add %[c], %[c], %[step1] \n\t"
      "add %[C], %[C], %[step] \n\t"
      "subs %[mc], %[mc], #1 \n\t"
      "bge loop_mc_%= \n\t"
      "end_mc_%=: \n\t"
      :
      // NOTE(review): c/C/mc/scale/bias are input-only operands yet modified
      // inside the asm — strictly they should be "+r" outputs. Flagged only.
      : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2),
        [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1),
        [scale] "r"(scale), [bias] "r"(bias)
      : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q10", "q11", "q12", "q13", "q14");
}
// C = A * B
// Vector (m == 1) variant of WriteBasic: copies n contiguous floats from
// buffer `c` into `C`, overwriting C. 16-float chunks, then 4-float quads,
// then one overlapping quad for the last partial quad (nc3 byte rewind;
// nc3 == 16 means the length is a multiple of 4 and the step is skipped).
// ldc is unused here since both buffers are a single contiguous row.
void VecWriteBasic(int n, float *c, float *C, int ldc) {
  int nc1 = n / 16;
  int _nc1 = n % 16;
  int nc2 = _nc1 / 4;
  int nc3 = 16 - 4 * (_nc1 % 4);
  asm volatile(
      "subs %[nc1], %[nc1], #1 \n\t"
      "blt end_nc1_%= \n\t"
      "loop_nc1_%=: \n\t"
      "vld1.32 {q0, q1}, [%[c]]! \n\t"
      "vst1.32 {q0, q1}, [%[C]]! \n\t"
      "vld1.32 {q2, q3}, [%[c]]! \n\t"
      "vst1.32 {q2, q3}, [%[C]]! \n\t"
      "subs %[nc1], %[nc1], #1 \n\t"
      "bge loop_nc1_%= \n\t"
      "end_nc1_%=: \n\t"
      "subs %[nc2], %[nc2], #1 \n\t"
      "blt end_nc2_%= \n\t"
      "loop_nc2_%=: \n\t"
      "vld1.32 {q4}, [%[c]]! \n\t"
      "vst1.32 {q4}, [%[C]]! \n\t"
      "subs %[nc2], %[nc2], #1 \n\t"
      "bge loop_nc2_%= \n\t"
      "end_nc2_%=: \n\t"
      // Overlapping tail quad covering the last n % 4 floats.
      "cmp %[nc3], #16 \n\t"
      "beq end_nc3_%= \n\t"
      "sub %[c], %[c], %[nc3] \n\t"
      "sub %[C], %[C], %[nc3] \n\t"
      "vld1.32 {q5}, [%[c]]! \n\t"
      "vst1.32 {q5}, [%[C]]! \n\t"
      "end_nc3_%=: \n\t"
      :
      // NOTE(review): c/C/nc1/nc2 are input-only operands but modified in
      // the asm — strictly "+r"; flagged, not changed here.
      : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3)
      : "memory", "q0", "q1", "q2", "q3", "q4", "q5");
}
// C = alpha * A * B + beta * C
// NOTE(review): intentionally left as an empty stub in this revision — the
// vector alpha/beta write-back path is not implemented, so callers reaching
// this branch get no output written. TODO: implement or assert unreachable.
void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
// C = A * B + C
// Vector (m == 1) variant of WriteWithAdd: accumulates n contiguous floats
// from buffer `c` into `C`. NEON handles full 16-float chunks; the n % 16
// tail is handled by the scalar loop (pointers were advanced by the asm via
// the "+r" output constraints). ldc is unused for this single-row case.
void VecWriteWithAdd(int n, float *c, float *C, int ldc) {
  int nc1 = n / 16;   // full 16-float chunks
  int _nc1 = n % 16;  // scalar tail
  asm volatile(
      "subs %[nc1], %[nc1], #1 \n\t"
      "blt end_nc1_%= \n\t"
      "loop_nc1_%=: \n\t"
      "vld1.32 {q0, q1}, [%[c]]! \n\t"
      "vld1.32 {q2, q3}, [%[C]] \n\t"
      "vadd.f32 q10, q0, q2 \n\t"
      "vadd.f32 q11, q1, q3 \n\t"
      "vst1.32 {q10, q11}, [%[C]]! \n\t"
      "vld1.32 {q4, q5}, [%[c]]! \n\t"
      "vld1.32 {q6, q7}, [%[C]] \n\t"
      "vadd.f32 q12, q4, q6 \n\t"
      "vadd.f32 q13, q5, q7 \n\t"
      "vst1.32 {q12, q13}, [%[C]]! \n\t"
      "subs %[nc1], %[nc1], #1 \n\t"
      "bge loop_nc1_%= \n\t"
      "end_nc1_%=: \n\t"
      : [C] "+r"(C), [c] "+r"(c)
      : [nc1] "r"(nc1)
      : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
        "q12", "q13");
  // Scalar accumulation of the n % 16 tail.
  if (_nc1 != 0) {
    for (int j = 0; j < _nc1; j++) {
      *C++ += *c++;
    }
  }
}
// C = A * B + C, relu(C)
// Same accumulation as VecWriteWithAdd, with each sum clamped at zero
// (ReLU via vmax against q14 == 0) before being stored back into C.
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
  int nc1 = n / 16;   // full 16-float chunks
  int _nc1 = n % 16;  // scalar tail
  asm volatile(
      // q14 holds the zero vector used for the ReLU clamp.
      "vmov.f32 q14, #0.0 \n\t"
      "subs %[nc1], %[nc1], #1 \n\t"
      "blt end_nc1_%= \n\t"
      "loop_nc1_%=: \n\t"
      "vld1.32 {q0, q1}, [%[c]]! \n\t"
      "vld1.32 {q2, q3}, [%[C]] \n\t"
      "vadd.f32 q10, q0, q2 \n\t"
      "vadd.f32 q11, q1, q3 \n\t"
      "vmax.f32 q10, q10, q14 \n\t"
      "vmax.f32 q11, q11, q14 \n\t"
      "vst1.32 {q10, q11}, [%[C]]! \n\t"
      "vld1.32 {q4, q5}, [%[c]]! \n\t"
      "vld1.32 {q6, q7}, [%[C]] \n\t"
      "vadd.f32 q12, q4, q6 \n\t"
      "vadd.f32 q13, q5, q7 \n\t"
      "vmax.f32 q12, q12, q14 \n\t"
      "vmax.f32 q13, q13, q14 \n\t"
      "vst1.32 {q12, q13}, [%[C]]! \n\t"
      "subs %[nc1], %[nc1], #1 \n\t"
      "bge loop_nc1_%= \n\t"
      "end_nc1_%=: \n\t"
      : [C] "+r"(C), [c] "+r"(c)
      : [nc1] "r"(nc1)
      // NOTE(review): q14 is written by the asm but missing from this
      // clobber list — flagged for follow-up, not changed here.
      : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
        "q12", "q13");
  // Scalar add + ReLU for the n % 16 tail.
  if (_nc1 != 0) {
    for (int j = 0; j < _nc1; j++) {
      *C += *c;
      if (*C < 0) {
        *C = 0;
      }
      C++;
      c++;
    }
  }
}
// C = A * B, batchnorm(C)
// Vector (m == 1) batchnorm write-back: C[j] = c[j] * scale[j] + bias[j],
// with per-COLUMN scale/bias (the scale/bias pointers advance in lockstep
// with c/C, unlike the per-row broadcast in WriteWithBn). Processes 16-float
// chunks, then 4-float quads, then one overlapping quad for the partial
// tail (nc3 byte rewind applied to all four pointers; nc3 == 16 means no
// tail). ldc is unused for this single-row case.
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
                    float *bias) {
  int nc1 = n / 16;
  int _nc1 = n % 16;
  int nc2 = _nc1 / 4;
  int nc3 = 16 - 4 * (_nc1 % 4);
  asm volatile(
      "subs %[nc1], %[nc1], #1 \n\t"
      "blt end_nc1_%= \n\t"
      // 16 floats per iteration; vmla computes bias + c * scale in place.
      "loop_nc1_%=: \n\t"
      "vld1.32 {q0, q1}, [%[c]]! \n\t"
      "vld1.32 {q2, q3}, [%[scale]]! \n\t"
      "vld1.32 {q10, q11}, [%[bias]]! \n\t"
      "vmla.f32 q10, q0, q2 \n\t"
      "vmla.f32 q11, q1, q3 \n\t"
      "vst1.32 {q10, q11}, [%[C]]! \n\t"
      "vld1.32 {q4, q5}, [%[c]]! \n\t"
      "vld1.32 {q6, q7}, [%[scale]]! \n\t"
      "vld1.32 {q12, q13}, [%[bias]]! \n\t"
      "vmla.f32 q12, q4, q6 \n\t"
      "vmla.f32 q13, q5, q7 \n\t"
      "vst1.32 {q12, q13}, [%[C]]! \n\t"
      "subs %[nc1], %[nc1], #1 \n\t"
      "bge loop_nc1_%= \n\t"
      "end_nc1_%=: \n\t"
      // 4 floats per iteration for the remaining full quads.
      "subs %[nc2], %[nc2], #1 \n\t"
      "blt end_nc2_%= \n\t"
      "loop_nc2_%=: \n\t"
      "vld1.32 {q0}, [%[c]]! \n\t"
      "vld1.32 {q1}, [%[scale]]! \n\t"
      "vld1.32 {q10}, [%[bias]]! \n\t"
      "vmla.f32 q10, q0, q1 \n\t"
      "vst1.32 {q10}, [%[C]]! \n\t"
      "subs %[nc2], %[nc2], #1 \n\t"
      "bge loop_nc2_%= \n\t"
      "end_nc2_%=: \n\t"
      // Partial-quad tail: rewind all pointers by nc3 bytes, redo one quad.
      "cmp %[nc3], #16 \n\t"
      "beq end_nc3_%= \n\t"
      "sub %[c], %[c], %[nc3] \n\t"
      "sub %[scale], %[scale], %[nc3] \n\t"
      "sub %[bias], %[bias], %[nc3] \n\t"
      "sub %[C], %[C], %[nc3] \n\t"
      "vld1.32 {q0}, [%[c]]! \n\t"
      "vld1.32 {q1}, [%[scale]]! \n\t"
      "vld1.32 {q10}, [%[bias]]! \n\t"
      "vmla.f32 q10, q0, q1 \n\t"
      "vst1.32 {q10}, [%[C]]! \n\t"
      "end_nc3_%=: \n\t"
      :
      // NOTE(review): all pointer operands and nc1/nc2 are input-only yet
      // modified inside the asm — strictly "+r"; flagged, not changed here.
      : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3),
        [scale] "r"(scale), [bias] "r"(bias)
      : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
        "q12", "q13");
}
// C = A * B, batchnorm(C), relu(C)
// Same per-column batchnorm write-back as VecWriteWithBn
// (C[j] = c[j] * scale[j] + bias[j]), with a ReLU clamp (vmax against
// q14 == 0) applied before each store. ldc is unused for this single row.
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale,
                        float *bias) {
  int nc1 = n / 16;
  int _nc1 = n % 16;
  int nc2 = _nc1 / 4;
  // Byte rewind for the overlapping partial-quad tail; 16 means "no tail".
  int nc3 = 16 - 4 * (_nc1 % 4);
  asm volatile(
      // q14 holds the zero vector used for the ReLU clamp.
      "vmov.f32 q14, #0.0 \n\t"
      "subs %[nc1], %[nc1], #1 \n\t"
      "blt end_nc1_%= \n\t"
      // 16 floats per iteration: max(bias + c * scale, 0) -> C.
      "loop_nc1_%=: \n\t"
      "vld1.32 {q0, q1}, [%[c]]! \n\t"
      "vld1.32 {q2, q3}, [%[scale]]! \n\t"
      "vld1.32 {q10, q11}, [%[bias]]! \n\t"
      "vmla.f32 q10, q0, q2 \n\t"
      "vmla.f32 q11, q1, q3 \n\t"
      "vmax.f32 q10, q10, q14 \n\t"
      "vmax.f32 q11, q11, q14 \n\t"
      "vst1.32 {q10, q11}, [%[C]]! \n\t"
      "vld1.32 {q4, q5}, [%[c]]! \n\t"
      "vld1.32 {q6, q7}, [%[scale]]! \n\t"
      "vld1.32 {q12, q13}, [%[bias]]! \n\t"
      "vmla.f32 q12, q4, q6 \n\t"
      "vmla.f32 q13, q5, q7 \n\t"
      "vmax.f32 q12, q12, q14 \n\t"
      "vmax.f32 q13, q13, q14 \n\t"
      "vst1.32 {q12, q13}, [%[C]]! \n\t"
      "subs %[nc1], %[nc1], #1 \n\t"
      "bge loop_nc1_%= \n\t"
      "end_nc1_%=: \n\t"
      // 4 floats per iteration for the remaining full quads.
      "subs %[nc2], %[nc2], #1 \n\t"
      "blt end_nc2_%= \n\t"
      "loop_nc2_%=: \n\t"
      "vld1.32 {q0}, [%[c]]! \n\t"
      "vld1.32 {q1}, [%[scale]]! \n\t"
      "vld1.32 {q10}, [%[bias]]! \n\t"
      "vmla.f32 q10, q0, q1 \n\t"
      "vmax.f32 q10, q10, q14 \n\t"
      "vst1.32 {q10}, [%[C]]! \n\t"
      "subs %[nc2], %[nc2], #1 \n\t"
      "bge loop_nc2_%= \n\t"
      "end_nc2_%=: \n\t"
      // Partial-quad tail: rewind all pointers by nc3 bytes, redo one quad.
      "cmp %[nc3], #16 \n\t"
      "beq end_nc3_%= \n\t"
      "sub %[c], %[c], %[nc3] \n\t"
      "sub %[scale], %[scale], %[nc3] \n\t"
      "sub %[bias], %[bias], %[nc3] \n\t"
      "sub %[C], %[C], %[nc3] \n\t"
      "vld1.32 {q0}, [%[c]]! \n\t"
      "vld1.32 {q1}, [%[scale]]! \n\t"
      "vld1.32 {q10}, [%[bias]]! \n\t"
      "vmla.f32 q10, q0, q1 \n\t"
      "vmax.f32 q10, q10, q14 \n\t"
      "vst1.32 {q10}, [%[C]]! \n\t"
      "end_nc3_%=: \n\t"
      :
      // NOTE(review): all pointer operands and nc1/nc2 are input-only yet
      // modified inside the asm — strictly "+r"; flagged, not changed here.
      : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3),
        [scale] "r"(scale), [bias] "r"(bias)
      : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
        "q12", "q13", "q14");
}
} // namespace operators
} // namespace paddle_mobile
} // namespace paddle_mobile
......@@ -19,12 +19,8 @@ limitations under the License. */
#define B(i, j) B[(i)*ldb + (j)]
#define C(i, j) C[(i)*ldc + (j)]
// 分块计算的块大小,mc 与 kc 分别对应分块计算时的 m 与 k
#define MC 128
#define KC 128
#define NC 1024
#define MR 4
#define NR 4
#define NR 8
#define s_min(i, j) ((i) < (j) ? (i) : (j))
......@@ -49,28 +45,66 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
// 分块矩阵乘法
void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
int first_time);
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
float beta, float *c, float *C, int ldc, bool relu);
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, int ldc,
bool relu, float *new_scale, float *new_bias);
// 向量矩阵乘法 (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc);
// 计算一个更小的 4 * 4 的 C 矩阵分块
void AddDot4x4(int k, float alpha, const float *A, int lda, const float *B,
int ldb, float beta, float *C, int ldc, int mc, int nc);
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc,
bool relu);
const float *B, int ldb, float beta, float *C, int ldc,
bool relu);
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu, float *new_scale, float *new_bias);
// 计算一个更小的 C 矩阵分块
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
// 分块矩阵乘法结果回写
// C = A * B
void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
// C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C
void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
// C = A * B + C, relu(C)
void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
// C = A * B, batchnorm(C)
void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
// C = A * B, batchnorm(C), relu(C)
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias);
// 向量矩阵乘法结果回写
// C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc);
// C = alpha * A * B + beta * C
void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
// C = A * B + C
void VecWriteWithAdd(int n, float *c, float *C, int ldc);
// C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
// C = A * B, batchnorm(C)
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
// C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
// 32位 float 矩阵乘法
void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc);
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, bool relu);
void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc);
// 32位 float 矩阵乘法, 并对结果进行 batchnrom
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias);
// 64位 double 矩阵乘法
void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
......
......@@ -39,22 +39,18 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
int M = dim_out[0];
int N = dim_out[1];
int K = (trans_a == false) ? dim_a[1] : dim_a[0];
int K = (!trans_a) ? dim_a[1] : dim_a[0];
if (relu) {
sgemm_relu(M, N, K, alpha, matrix_a.data<float>(), K,
matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N);
} else {
sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
beta, matrix_out->data<float>(), N);
}
Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
beta, matrix_out->data<float>(), N, relu);
}
template <>
void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
double alpha, framework::Tensor *matrix_out, double beta,
bool relu) {
void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out, float beta,
bool relu, framework::Tensor *new_scale,
framework::Tensor *new_bias) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
......@@ -71,7 +67,11 @@ void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
int M = dim_out[0];
int N = dim_out[1];
int K = (trans_a == false) ? dim_a[1] : dim_a[0];
int K = (!trans_a) ? dim_a[1] : dim_a[0];
SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
N, beta, matrix_out->data<float>(), N, relu,
new_scale->data<float>(), new_bias->data<float>());
}
} // namespace math
......
......@@ -26,6 +26,12 @@ template <typename T>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false);
template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu,
framework::Tensor *new_scale, framework::Tensor *new_bias);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......@@ -21,7 +21,10 @@ namespace math {
void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) {
#if __ARM_NEON
#ifdef __ARM_NEON
#ifdef ARMV7
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
......@@ -90,11 +93,15 @@ void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
output_data += output_batch_stride;
}
#endif
#endif
}
void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) {
#if __ARM_NEON
#ifdef __ARM_NEON
#ifdef ARMV7
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
......@@ -164,6 +171,12 @@ void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#else
// TODO(): to imp other asm
#endif
#endif
}
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#pragma once
#include "framework/tensor.h"
#if __ARM_NEON
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
namespace paddle_mobile {
......
......@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#include "pool_3x3.h"
#ifdef _OPENMP
#include <omp.h>
#endif
#include "framework/tensor.h"
#include "pool_3x3.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
......@@ -40,46 +43,52 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
const int w_out = output->dims()[3];
const int outputdata_channel_stride = h_out * w_out;
const int inputdata_channel_stride = h_in * w_in;
const int input_batch_stride = output_channels * inputdata_channel_stride;
const int output_batch_stride = output_channels * outputdata_channel_stride;
float *out_data = output->data<float>();
const float *input_data = input->data<float>();
const float coef = 1.0 / 9.0;
for (int k = 0; k < batch_size; ++k) {
#pragma omp parallel for
for (int c = 0; c < output_channels; ++c) {
const float *input_seg = input_data + c * inputdata_channel_stride;
float *output_seg = out_data + c * outputdata_channel_stride;
// four corner point
out_data[0] = (input_data[0] + input_data[1] + input_data[w_in] +
input_data[w_in + 1]) *
coef;
out_data[w_out - 1] =
(input_data[w_in - 2] + input_data[w_in - 1] +
input_data[w_in * 2 - 2] + input_data[2 * w_in - 1]) *
output_seg[0] = (input_seg[0] + input_seg[1] + input_seg[w_in] +
input_seg[w_in + 1]) *
coef;
output_seg[w_out - 1] =
(input_seg[w_in - 2] + input_seg[w_in - 1] + input_seg[w_in * 2 - 2] +
input_seg[2 * w_in - 1]) *
coef;
out_data[(h_out - 1) * w_out] =
(input_data[(h_in - 2) * w_in] + input_data[(h_in - 2) * w_in + 1] +
input_data[(h_in - 1) * w_in] + input_data[(h_in - 1) * w_in + 1]) *
output_seg[(h_out - 1) * w_out] =
(input_seg[(h_in - 2) * w_in] + input_seg[(h_in - 2) * w_in + 1] +
input_seg[(h_in - 1) * w_in] + input_seg[(h_in - 1) * w_in + 1]) *
coef;
out_data[h_out * w_out - 1] =
(input_data[h_in * w_in - 1] + input_data[h_in * w_in - 2] +
input_data[(h_in - 1) * w_in - 1] +
input_data[(h_in - 1) * w_in - 2]) *
output_seg[h_out * w_out - 1] =
(input_seg[h_in * w_in - 1] + input_seg[h_in * w_in - 2] +
input_seg[(h_in - 1) * w_in - 1] +
input_seg[(h_in - 1) * w_in - 2]) *
coef;
// left side & right side
for (int i = 1; i < h_in - 1; ++i) {
out_data[i * w_out] =
(input_data[i * w_in - w_in] + input_data[i * w_in - w_in + 1] +
input_data[i * w_in] + input_data[i * w_in + 1] +
input_data[i * w_in + w_in] + input_data[i * w_in + w_in + 1]) *
output_seg[i * w_out] =
(input_seg[i * w_in - w_in] + input_seg[i * w_in - w_in + 1] +
input_seg[i * w_in] + input_seg[i * w_in + 1] +
input_seg[i * w_in + w_in] + input_seg[i * w_in + w_in + 1]) *
coef;
out_data[i * w_out + w_out - 1] =
(input_data[i * w_in - w_in + w_in - 2] +
input_data[i * w_in - w_in + 1 + w_in - 2] +
input_data[i * w_in + w_in - 2] +
input_data[i * w_in + 1 + w_in - 2] +
input_data[i * w_in + w_in + w_in - 2] +
input_data[i * w_in + w_in + 1 + w_in - 2]) *
output_seg[i * w_out + w_out - 1] =
(input_seg[i * w_in - w_in + w_in - 2] +
input_seg[i * w_in - w_in + 1 + w_in - 2] +
input_seg[i * w_in + w_in - 2] +
input_seg[i * w_in + 1 + w_in - 2] +
input_seg[i * w_in + w_in + w_in - 2] +
input_seg[i * w_in + w_in + 1 + w_in - 2]) *
coef;
}
// top 1 row & bottom 1 row
const float *input_tmp = input_data;
const float *input_tmp = input_seg;
float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
tmp3, tmp4, tmp5, sum, out0;
......@@ -90,7 +99,7 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
in4 = vld1q_f32(input_tmp_end);
in6 = vld1q_f32(input_tmp_end + w_in);
int c_mid = w_out - 2;
auto output_ptr = out_data + 1;
auto output_ptr = output_seg + 1;
for (; c_mid > 3; c_mid -= 4) {
in1 = vld1q_f32(input_tmp + 4);
in3 = vld1q_f32(input_tmp + w_in + 4);
......@@ -135,8 +144,8 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
in6 = in7;
}
// top right remain
float32x4_t pad0 = vdupq_n_f32(input_data[w_in - 1]);
float32x4_t pad1 = vdupq_n_f32(input_data[2 * w_in - 1]);
float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]);
float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]);
tmp0 = vextq_f32(in0, pad0, 1);
tmp1 = vextq_f32(in0, pad0, 2);
......@@ -163,8 +172,8 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
}
// bottom_right remain
float32x4_t pad2 = vdupq_n_f32(input_data[(h_in - 1) * w_in - 1]);
float32x4_t pad3 = vdupq_n_f32(input_data[h_in * w_in - 1]);
float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]);
float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]);
tmp0 = vextq_f32(in4, pad2, 1);
tmp1 = vextq_f32(in4, pad2, 2);
......@@ -191,8 +200,8 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
}
// mid
for (int j = 0; j < h_out - 2; ++j) {
output_ptr = out_data + w_out * (j + 1) + 1;
input_tmp = input_data + j * w_in;
output_ptr = output_seg + w_out * (j + 1) + 1;
input_tmp = input_seg + j * w_in;
in0 = vld1q_f32(input_tmp);
in2 = vld1q_f32(input_tmp + w_in);
......@@ -228,9 +237,9 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
in4 = in5;
}
// mid remain
float32x4_t pad0 = vdupq_n_f32(input_data[(j + 1) * w_in - 1]);
float32x4_t pad1 = vdupq_n_f32(input_data[(j + 2) * w_in - 1]);
float32x4_t pad2 = vdupq_n_f32(input_data[(j + 2) * w_in - 1]);
// Right-edge padding lanes for the "mid remain" columns: each pad vector
// replicates the last element of one input row so vextq_f32 can shift it in.
// NOTE(review): pad1 and pad2 both replicate row (j + 2); the max-pool
// counterpart (Pool3x3Maxs1p1) uses rows (j+1), (j+2), (j+3) here, so pad2
// likely should read input_seg[(j + 3) * w_in - 1] — TODO confirm.
float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]);
float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]);
float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]);
tmp0 = vextq_f32(in0, pad0, 1);
tmp1 = vextq_f32(in0, pad0, 2);
......@@ -261,9 +270,11 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
}
}
}
input_data += inputdata_channel_stride;
out_data += outputdata_channel_stride;
// input_data += inputdata_channel_stride;
// out_data += outputdata_channel_stride;
}
input_data += input_batch_stride;
out_data += output_batch_stride;
}
#endif
}
......@@ -282,44 +293,50 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
const int w_out = output->dims()[3];
const int outputdata_channel_stride = h_out * w_out;
const int inputdata_channel_stride = h_in * w_in;
const int input_batch_stride = output_channels * inputdata_channel_stride;
const int output_batch_stride = output_channels * outputdata_channel_stride;
float *out_data = output->data<float>();
const float *input_data = input->data<float>();
for (int k = 0; k < batch_size; ++k) {
#pragma omp parallel for
for (int c = 0; c < output_channels; ++c) {
const float *input_seg = input_data + c * inputdata_channel_stride;
float *output_seg = out_data + c * outputdata_channel_stride;
// four corner point
out_data[0] = std::max(std::max(input_data[0], input_data[1]),
std::max(input_data[w_in], input_data[w_in + 1]));
out_data[w_out - 1] = std::max(
std::max(input_data[w_in - 2], input_data[w_in - 1]),
std::max(input_data[w_in * 2 - 2], input_data[2 * w_in - 1]));
out_data[(h_out - 1) * w_out] =
std::max(std::max(input_data[(h_in - 2) * w_in],
input_data[(h_in - 2) * w_in + 1]),
std::max(input_data[(h_in - 1) * w_in],
input_data[(h_in - 1) * w_in + 1]));
out_data[h_out * w_out - 1] = std::max(
std::max(input_data[(h_in - 1) * w_in - 1],
input_data[(h_in - 1) * w_in - 2]),
std::max(input_data[h_in * w_in - 1], input_data[h_in * w_in - 2]));
output_seg[0] = std::max(std::max(input_seg[0], input_seg[1]),
std::max(input_seg[w_in], input_seg[w_in + 1]));
output_seg[w_out - 1] =
std::max(std::max(input_seg[w_in - 2], input_seg[w_in - 1]),
std::max(input_seg[w_in * 2 - 2], input_seg[2 * w_in - 1]));
output_seg[(h_out - 1) * w_out] =
std::max(std::max(input_seg[(h_in - 2) * w_in],
input_seg[(h_in - 2) * w_in + 1]),
std::max(input_seg[(h_in - 1) * w_in],
input_seg[(h_in - 1) * w_in + 1]));
output_seg[h_out * w_out - 1] = std::max(
std::max(input_seg[(h_in - 1) * w_in - 1],
input_seg[(h_in - 1) * w_in - 2]),
std::max(input_seg[h_in * w_in - 1], input_seg[h_in * w_in - 2]));
// left side & right side
for (int i = 1; i < h_in - 1; ++i) {
float max1 = std::max(input_data[i * w_in - w_in],
input_data[i * w_in - w_in + 1]);
float max2 = std::max(input_data[i * w_in], input_data[i * w_in + 1]);
float max3 = std::max(input_data[i * w_in + w_in],
input_data[i * w_in + w_in + 1]);
out_data[i * w_out] = std::max(std::max(max1, max2), max3);
max1 = std::max(input_data[i * w_in - w_in + w_in - 2],
input_data[i * w_in - w_in + 1 + w_in - 2]);
max2 = std::max(input_data[i * w_in + w_in - 2],
input_data[i * w_in + 1 + w_in - 2]);
max3 = std::max(input_data[i * w_in + w_in + w_in - 2],
input_data[i * w_in + w_in + 1 + w_in - 2]);
out_data[i * w_out + w_out - 1] = std::max(std::max(max1, max2), max3);
float max1 = std::max(input_seg[i * w_in - w_in],
input_seg[i * w_in - w_in + 1]);
float max2 = std::max(input_seg[i * w_in], input_seg[i * w_in + 1]);
float max3 = std::max(input_seg[i * w_in + w_in],
input_seg[i * w_in + w_in + 1]);
output_seg[i * w_out] = std::max(std::max(max1, max2), max3);
max1 = std::max(input_seg[i * w_in - w_in + w_in - 2],
input_seg[i * w_in - w_in + 1 + w_in - 2]);
max2 = std::max(input_seg[i * w_in + w_in - 2],
input_seg[i * w_in + 1 + w_in - 2]);
max3 = std::max(input_seg[i * w_in + w_in + w_in - 2],
input_seg[i * w_in + w_in + 1 + w_in - 2]);
output_seg[i * w_out + w_out - 1] =
std::max(std::max(max1, max2), max3);
}
// top 1 row & bottom 1 row
const float *input_tmp = input_data;
const float *input_tmp = input_seg;
float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
tmp3, tmp4, tmp5, max;
......@@ -329,7 +346,7 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
in4 = vld1q_f32(input_tmp_end);
in6 = vld1q_f32(input_tmp_end + w_in);
int c_mid = w_out - 2;
auto output_ptr = out_data + 1;
auto output_ptr = output_seg + 1;
for (; c_mid > 3; c_mid -= 4) {
in1 = vld1q_f32(input_tmp + 4);
in3 = vld1q_f32(input_tmp + w_in + 4);
......@@ -373,8 +390,8 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
in6 = in7;
}
// top right remain
float32x4_t pad0 = vdupq_n_f32(input_data[w_in - 1]);
float32x4_t pad1 = vdupq_n_f32(input_data[2 * w_in - 1]);
float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]);
float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]);
tmp0 = vextq_f32(in0, pad0, 1);
tmp1 = vextq_f32(in0, pad0, 2);
......@@ -400,8 +417,8 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
}
// bottom_right remain
float32x4_t pad2 = vdupq_n_f32(input_data[(h_in - 1) * w_in - 1]);
float32x4_t pad3 = vdupq_n_f32(input_data[h_in * w_in - 1]);
float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]);
float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]);
tmp0 = vextq_f32(in4, pad2, 1);
tmp1 = vextq_f32(in4, pad2, 2);
......@@ -427,8 +444,8 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
}
// mid
for (int j = 0; j < h_out - 2; ++j) {
output_ptr = out_data + (j + 1) * w_out + 1;
input_tmp = input_data + j * w_in;
output_ptr = output_seg + (j + 1) * w_out + 1;
input_tmp = input_seg + j * w_in;
in0 = vld1q_f32(input_tmp);
in2 = vld1q_f32(input_tmp + w_in);
......@@ -463,9 +480,9 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
in4 = in5;
}
// mid remain
float32x4_t pad0 = vdupq_n_f32(input_data[(j + 1) * w_in - 1]);
float32x4_t pad1 = vdupq_n_f32(input_data[(j + 2) * w_in - 1]);
float32x4_t pad2 = vdupq_n_f32(input_data[(j + 3) * w_in - 1]);
float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]);
float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]);
float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 3) * w_in - 1]);
tmp0 = vextq_f32(in0, pad0, 1);
tmp1 = vextq_f32(in0, pad0, 2);
......@@ -495,9 +512,11 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
}
}
}
input_data += inputdata_channel_stride;
out_data += outputdata_channel_stride;
// input_data += inputdata_channel_stride;
// out_data += outputdata_channel_stride;
}
input_data += input_batch_stride;
out_data += output_batch_stride;
}
#endif
}
......@@ -515,11 +534,11 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int _kernel_size = 3;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
// const int _kernel_size = 3;
const int stride = strides[0];
// const int stride_width = strides[1];
const int padding = paddings[0];
// const int padding_width = paddings[1];
const float negative_max = -INT_MAX;
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
......@@ -529,36 +548,39 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
const float *pos1, *pos2, *pos3, *output_ptr;
const float *pos1, *output_ptr;
int hstart, wstart, hend, wend;
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int c = 0; c < output_channels; ++c) {
const float *input_seg = input_data + c * input_channel_stride;
float *output_seg = output_data + c * output_channel_stride;
for (int ph = 0; ph < output_height; ph++) {
for (int pw = 0; pw < output_width; pw++) {
hstart = ph * stride_height - padding_height;
wstart = pw * stride_width - padding_width;
hend = min(hstart + _kernel_size, input_height + padding_height);
wend = min(wstart + _kernel_size, input_width + padding_width);
int hstart = ph * stride - padding;
int wstart = pw * stride - padding;
int hend = min(hstart + 3, input_height + padding);
int wend = min(wstart + 3, input_width + padding);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, input_height);
wend = min(wend, input_width);
pos1 = input_data + hstart * input_width + wstart;
pos2 = input_data + (hstart + 1) * input_width + wstart;
pos3 = input_data + (hstart + 2) * input_width + wstart;
output_ptr = output_data + ph * output_width + pw;
const float *pos1 = input_seg + hstart * input_width + wstart;
const float *pos2 = input_seg + (hstart + 1) * input_width + wstart;
const float *pos3 = input_seg + (hstart + 2) * input_width + wstart;
output_ptr = output_seg + ph * output_width + pw;
if (hend - hstart != 3 || wend - wstart != 3) {
float max_value = -INT_MAX;
for (int h = hstart; h < hend; h++) {
for (int w = wstart; w < wend; w++) {
float value = input_data[h * input_width + w];
float value = input_seg[h * input_width + w];
if (value > max_value) {
max_value = value;
}
}
}
output_data[ph * output_width + pw] = max_value;
output_seg[ph * output_width + pw] = max_value;
} else {
#if defined(ARMV7)
asm volatile(
......@@ -572,27 +594,25 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
"vpmax.f32 d7, d6, d6 \n\t"
"vst1.32 {d7[0]},[%[output_ptr]] \n\t"
:
: [input_data] "r"(input_data), [pos1] "r"(pos1),
: [input_seg] "r"(input_seg), [pos1] "r"(pos1),
[pos2] "r"(pos2), [pos3] "r"(pos3),
[output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max)
: "memory", "q1", "q2", "q3", "q4");
#else
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos2);
const float32x4_t data3 = vld1q_f32(pos3);
const float32x4_t data2 = vld1q_f32(pos1 + input_width);
const float32x4_t data3 = vld1q_f32(pos1 + 2 * input_width);
const float32x4_t max_data =
vmaxq_f32(vmaxq_f32(data1, data3), data2);
vmaxq_f32(vmaxq_f32(data1, data2), data3);
float32x2_t res =
vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)),
vget_low_f32(max_data));
res = vpmax_f32(res, res);
output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
output_seg[ph * output_width + pw] = vget_lane_f32(res, 0);
#endif
}
}
}
input_data += input_channel_stride;
output_data += output_channel_stride;
}
input_data += input_batch_stride;
output_data += output_batch_stride;
......@@ -613,11 +633,8 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int _kernel_size = 3;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int stride = strides[0];
const int padding = paddings[0];
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
......@@ -631,30 +648,33 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int c = 0; c < output_channels; ++c) {
const float *input_seg = input_data + c * input_channel_stride;
float *output_seg = output_data + c * output_channel_stride;
for (int ph = 0; ph < output_height; ph++) {
for (int pw = 0; pw < output_width; pw++) {
int hstart = ph * stride_height - padding_height;
int wstart = pw * stride_width - padding_width;
int hend = min(hstart + _kernel_size, input_height + padding_height);
int wend = min(wstart + _kernel_size, input_width + padding_width);
int hstart = ph * stride - padding;
int wstart = pw * stride - padding;
int hend = min(hstart + 3, input_height + padding);
int wend = min(wstart + 3, input_width + padding);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, input_height);
wend = min(wend, input_width);
const float *pos1 = input_data + hstart * input_width + wstart;
const float *pos2 = input_data + (hstart + 1) * input_width + wstart;
const float *pos3 = input_data + (hstart + 2) * input_width + wstart;
const float *output_ptr = output_data + ph * output_width + pw;
const float *pos1 = input_seg + hstart * input_width + wstart;
const float *pos2 = input_seg + (hstart + 1) * input_width + wstart;
const float *pos3 = input_seg + (hstart + 2) * input_width + wstart;
float *output_ptr = output_seg + ph * output_width + pw;
if (hend - hstart != 3 || wend - wstart != 3) {
float sum = 0;
for (int h = hstart; h < hend; h++) {
for (int w = wstart; w < wend; w++) {
sum += input_data[h * input_width + w];
sum += input_seg[h * input_width + w];
}
}
output_data[ph * output_width + pw] = sum / 9.0;
output_seg[ph * output_width + pw] = sum / 9.0;
} else {
#if defined(ARMV7)
......@@ -671,7 +691,7 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
"vmul.f32 d6,d7 \n\t"
"vst1.32 {d6[0]},[%[output_ptr]] \n\t"
:
: [input_data] "r"(input_data), [pos1] "r"(pos1),
: [input_seg] "r"(input_seg), [pos1] "r"(pos1),
[pos2] "r"(pos2), [pos3] "r"(pos3),
[output_ptr] "r"(output_ptr), [zero] "r"(zero),
[nine_ptr] "r"(nine_ptr)
......@@ -686,13 +706,11 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
vpadd_f32(vget_high_f32(vsetq_lane_f32(0, sum_data, 3)),
vget_low_f32(sum_data));
res = vpadd_f32(res, res);
output_data[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0;
output_seg[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0;
#endif
}
}
}
input_data += input_channel_stride;
output_data += output_channel_stride;
}
input_data += input_batch_stride;
output_data += output_batch_stride;
......
......@@ -15,6 +15,9 @@ limitations under the License. */
#ifdef POOL_OP
#pragma once
#ifdef _OPENMP
#include <omp.h>
#endif
#include <algorithm>
#include <vector>
#include "framework/tensor.h"
......
......@@ -16,6 +16,9 @@ limitations under the License. */
#include "pooling.h"
#include "common/types.h"
#ifdef _OPENMP
#include <omp.h>
#endif
namespace paddle_mobile {
namespace operators {
......@@ -57,8 +60,8 @@ class PoolFunctor<CPU, PoolProcess, T> {
T *output_data = output->mutable_data<T>();
for (int i = 0; i < batch_size; i++) {
#pragma omp parallel for
for (int c = 0; c < output_channels; ++c) {
#pragma omp parallel for
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height);
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "operators/math/softmax.h"
#include "common/types.h"
#if __ARM_NEON
#ifdef __ARM_NEON
#include <math.h>
#include <algorithm>
#include "operators/math/math_func_neon.h"
......@@ -29,7 +29,7 @@ using framework::DDim;
using framework::Tensor;
template <typename T>
class SoftmaxFuntor<CPU, T> {
#if __ARM_NEON
#ifdef __ARM_NEON
void sum(float *input, float *sumptr, int inner_size, int outter_size) {
float32x4_t acc = vdupq_n_f32(0);
float sum_ = 0;
......@@ -144,7 +144,7 @@ class SoftmaxFuntor<CPU, T> {
framework::Tensor sub_X = X->Slice(i, i + 1);
framework::Tensor sub_Y = Y->Slice(i, i + 1);
#if __ARM_NEON
#ifdef __ARM_NEON
SoftmaxCacl(&sub_X, &sub_Y);
#endif
}
......
......@@ -56,11 +56,9 @@ template class MulOp<CPU, float>;
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(mul);
REGISTER_OPERATOR_CPU(mul, ops::MulOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(mul);
REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
......
......@@ -46,4 +46,13 @@ class MulOp : public framework::OperatorWithKernel<
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(mul);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(mul);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -40,7 +40,6 @@ template class MultiClassNMSOp<CPU, float>;
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(multiclass_nms);
REGISTER_OPERATOR_CPU(multiclass_nms, ops::MultiClassNMSOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
......
......@@ -52,4 +52,12 @@ class MultiClassNMSOp : public framework::OperatorWithKernel<
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(multiclass_nms);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -715,6 +715,123 @@ class ReshapeParam : public OpParam {
};
#endif
#ifdef SCALE_OP
// Parameter pack for the SCALE op. Extracts the input/bias/output tensors
// from the scope and the per-channel "scales"/"biases" attributes from the
// op description; kernels read these through the const accessors.
class ScaleParam : public OpParam {
public:
ScaleParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_x_ = InputXFrom<LoDTensor>(inputs, scope);
input_bias_ = InputBiasFrom<framework::LoDTensor>(inputs, scope);
out_ = OutFrom<LoDTensor>(outputs, scope);
// Attribute semantics ("inplace", "has_bias", element meaning of
// scales/biases) are defined by the model format — presumably one scale
// and bias per channel; verify against the scale kernel. TODO confirm.
inplace_ = GetAttr<bool>("inplace", attrs);
has_bias_ = GetAttr<bool>("has_bias", attrs);
scales_ = GetAttr<vector<float>>("scales", attrs);
biases_ = GetAttr<vector<float>>("biases", attrs);
}
// Read-only accessors; references avoid copying the vectors.
const Tensor *InputX() const { return input_x_; }
const Tensor *InputBias() const { return input_bias_; }
Tensor *Out() const { return out_; }
const bool &Inplace() const { return inplace_; }
const bool &HasBias() const { return has_bias_; }
const vector<float> &Scales() const { return scales_; }
const vector<float> &Biases() const { return biases_; }
private:
Tensor *input_x_;
Tensor *input_bias_;
Tensor *out_;
bool inplace_;
bool has_bias_;
vector<float> scales_;
vector<float> biases_;
};
#endif
#ifdef SLICE_OP
// Parameter pack for the SLICE op. Holds the tensor being sliced, an
// auxiliary shape tensor, the output, and the "axis"/"slice_points"
// attributes that describe where to cut.
class SliceParam : public OpParam {
public:
SliceParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_x_ = InputXFrom<LoDTensor>(inputs, scope);
input_shape_ = InputShapeFrom<LoDTensor>(inputs, scope);
out_ = OutFrom<LoDTensor>(outputs, scope);
axis_ = GetAttr<int>("axis", attrs);
slice_points_ = GetAttr<vector<int>>("slice_points", attrs);
inplace_ = GetAttr<bool>("inplace", attrs);
}
// Read-only accessors for the kernel.
const Tensor *InputX() const { return input_x_; }
const Tensor *InputShape() const { return input_shape_; }
Tensor *Out() const { return out_; }
const int &Axis() const { return axis_; }
const vector<int> &SlicePoints() const { return slice_points_; }
const bool &Inplace() const { return inplace_; }
private:
Tensor *input_x_;
Tensor *input_shape_;
Tensor *out_;
int axis_;
vector<int> slice_points_;
bool inplace_;
};
#endif
#ifdef RESIZE_OP
// Parameter pack for the RESIZE op. Carries either an explicit output
// height/width or fractional output scales, plus the "is_pyramid_test"
// flag; the kernel decides which to use (see resize kernel — not visible
// here).
class ResizeParam : public OpParam {
public:
ResizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_x_ = InputXFrom<LoDTensor>(inputs, scope);
input_shape_ = InputShapeFrom<LoDTensor>(inputs, scope);
out_ = OutFrom<LoDTensor>(outputs, scope);
is_pyramid_test_ = GetAttr<bool>("is_pyramid_test", attrs);
height_ = GetAttr<int>("height", attrs);
width_ = GetAttr<int>("width", attrs);
out_height_scale_ = GetAttr<float>("out_height_scale", attrs);
out_width_scale_ = GetAttr<float>("out_width_scale", attrs);
}
// Read-only accessors for the kernel.
const Tensor *InputX() const { return input_x_; }
const Tensor *InputShape() const { return input_shape_; }
Tensor *Out() const { return out_; }
const bool &IsPyramidTest() const { return is_pyramid_test_; }
const int &Height() const { return height_; }
const int &Width() const { return width_; }
const float &OutHeightScale() const { return out_height_scale_; }
const float &OutWidthScale() const { return out_width_scale_; }
private:
Tensor *input_x_;
Tensor *input_shape_;
Tensor *out_;
bool is_pyramid_test_;
int height_;
int width_;
float out_height_scale_;
float out_width_scale_;
};
#endif
#ifdef RELU_OP
/*
* @b The op layer instantiates this param and passes it to the kernel layer
......@@ -737,6 +854,27 @@ class ReluParam : public OpParam {
};
#endif
#ifdef PRELU_OP
// Parameter pack for the PRELU op: the input tensor, the output tensor,
// and the learned negative-slope coefficients read from the "slopes"
// attribute.
class PReluParam : public OpParam {
public:
PReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_x_ = InputXFrom<LoDTensor>(inputs, scope);
out_ = OutFrom<LoDTensor>(outputs, scope);
slopes_ = GetAttr<vector<float>>("slopes", attrs);
}
// Read-only accessors for the kernel.
const Tensor *InputX() const { return input_x_; }
Tensor *Out() const { return out_; }
const vector<float> &Slopes() const { return slopes_; }
private:
Tensor *input_x_;
Tensor *out_;
vector<float> slopes_;
};
#endif
#ifdef FUSION_FC_OP
class FusionFcParam : public OpParam {
public:
......
......@@ -60,11 +60,9 @@ template class PoolOp<CPU, float>;
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(pool2d);
REGISTER_OPERATOR_CPU(pool2d, ops::PoolOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(pool2d);
REGISTER_OPERATOR_MALI_GPU(pool2d, ops::PoolOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
......
......@@ -48,4 +48,13 @@ class PoolOp : public OperatorWithKernel<DeviceType, PoolParam,
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(pool2d);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(pool2d);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PRELU_OP
#include "operators/prelu_op.h"
namespace paddle_mobile {
namespace operators {
// PRelu is element-wise, so the output tensor has exactly the input's
// dimensions.
template <typename Dtype, typename T>
void PReluOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims);
}
template class PReluOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
/*
 * @b Every op must be registered:
 * the argument of USE_OP and the first argument of REGISTER_OPERATOR
 * must both match the op type string used in the model.
 * */
// Per-backend registration of the prelu operator. The op name passed to
// USE_OP_* / REGISTER_OPERATOR_* must match the op type string in the model.
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(prelu);
REGISTER_OPERATOR_CPU(prelu, ops::PReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(prelu);
REGISTER_OPERATOR_MALI_GPU(prelu, ops::PReluOp);
#endif
// FPGA backend: no prelu registration yet.
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PRELU_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/prelu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using paddle_mobile::framework::Tensor;
// PRelu operator: binds PReluParam to PReluKernel via OperatorWithKernel.
// NOTE(review): the explicit constructor appears to duplicate what the
// inheriting `using ...::OperatorWithKernel;` declaration below already
// provides — one of the two is presumably redundant; confirm before
// removing either.
template <typename DeviceType, typename T>
class PReluOp
: public framework::OperatorWithKernel<
DeviceType, PReluParam, operators::PReluKernel<DeviceType, T>> {
public:
PReluOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, PReluParam,
operators::PReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, PReluParam,
operators::PReluKernel<DeviceType, T>>::OperatorWithKernel;
// Output shape equals input shape (defined in prelu_op.cpp).
void InferShape() const override;
protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -50,7 +50,6 @@ template class PriorBoxOp<CPU, float>;
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(prior_box);
REGISTER_OPERATOR_CPU(prior_box, ops::PriorBoxOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
......
......@@ -51,4 +51,12 @@ class PriorBoxOp
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(prior_box);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -34,11 +34,9 @@ template class ReluOp<CPU, float>;
* */
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(relu);
REGISTER_OPERATOR_CPU(relu, ops::ReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(relu);
REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
......
......@@ -53,4 +53,13 @@ class ReluOp
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(relu);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -33,11 +33,9 @@ template class ReshapeOp<CPU, float>;
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(reshape);
REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(reshape);
REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
......
......@@ -51,4 +51,14 @@ class ReshapeOp
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(reshape);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(reshape);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESIZE_OP
#include "operators/resize_op.h"
#include <vector>
namespace paddle_mobile {
namespace operators {
// Delegates output-shape computation to CalOutputShape, which interprets
// ResizeParam (explicit height/width vs. scales — see the resize kernel).
template <typename Dtype, typename T>
void ResizeOp<Dtype, T>::InferShape() const {
auto out_dims = CalOutputShape(this->param_);
this->param_.Out()->Resize(out_dims);
}
template class ResizeOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(resize);
REGISTER_OPERATOR_CPU(resize, ops::ResizeOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(resize);
REGISTER_OPERATOR_MALI_GPU(resize, ops::ResizeOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESIZE_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/resize_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using paddle_mobile::framework::Tensor;
// Resize operator: binds ResizeParam to ResizeKernel via OperatorWithKernel.
template <typename DeviceType, typename T>
class ResizeOp
: public framework::OperatorWithKernel<
DeviceType, ResizeParam, operators::ResizeKernel<DeviceType, T>> {
public:
// Pass attrs by const reference (was by value): avoids copying the
// AttributeMap and matches the sibling operators (ScaleOp, PReluOp).
// Source-compatible for all existing callers.
ResizeOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, ResizeParam,
operators::ResizeKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, ResizeParam,
operators::ResizeKernel<DeviceType, T>>::OperatorWithKernel;
// Computes the output shape from ResizeParam (see resize_op.cpp).
void InferShape() const override;
protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SCALE_OP
#include "operators/scale_op.h"
#include <vector>
namespace paddle_mobile {
namespace operators {
// Scale is element-wise, so the output tensor has exactly the input's
// dimensions.
template <typename Dtype, typename T>
void ScaleOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims);
}
template class ScaleOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
// Operator registration: binds the "scale" op type to ops::ScaleOp on each
// enabled backend.  USE_OP_* / REGISTER_OPERATOR_* are framework macros —
// presumably USE_OP_* pins the registrar symbol so the linker keeps it;
// confirm against framework/op_registry.
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(scale);
REGISTER_OPERATOR_CPU(scale, ops::ScaleOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(scale);
REGISTER_OPERATOR_MALI_GPU(scale, ops::ScaleOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
// NOTE(review): no FPGA registration for scale yet.
#endif
#endif  // SCALE_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SCALE_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/scale_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using paddle_mobile::framework::Tensor;
// ScaleOp: kernel-based operator wrapper for the scale op.
// InferShape() (in the matching .cpp) makes the output keep the input's dims.
template <typename DeviceType, typename T>
class ScaleOp
    : public framework::OperatorWithKernel<
          DeviceType, ScaleParam, operators::ScaleKernel<DeviceType, T>> {
 public:
  // Forwards construction to the kernel-based base class.
  ScaleOp(const std::string &type, const VariableNameMap &inputs,
          const VariableNameMap &outputs,
          const framework::AttributeMap &attrs,
          std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, ScaleParam,
                                      operators::ScaleKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  // Shape inference for this op; implementation lives in the .cpp file.
  void InferShape() const override;

  // NOTE(review): removed the inheriting-constructor using-declaration left
  // by the merge — it had the same signature as the explicit constructor
  // above and was therefore dead; callers are unaffected.
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -28,7 +28,6 @@ template class SigmoidOp<CPU, float>;
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(sigmoid);
REGISTER_OPERATOR_CPU(sigmoid, ops::SigmoidOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
......
......@@ -46,4 +46,12 @@ class SigmoidOp
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(sigmoid);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SLICE_OP
#include "operators/slice_op.h"
#include <vector>
namespace paddle_mobile {
namespace operators {
// Shape inference for the slice op.
// TODO(review): the output dims are never set here — InputShape() detection
// is still missing, so Out keeps whatever dims it already has.  Confirm the
// intended slice semantics before relying on the output shape.
template <typename Dtype, typename T>
void SliceOp<Dtype, T>::InferShape() const {
  /// todo: add InputShape() detection.
}
// Explicit instantiation for the CPU/float configuration.
template class SliceOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
// Operator registration: binds the "slice" op type to ops::SliceOp on each
// enabled backend.  USE_OP_* / REGISTER_OPERATOR_* are framework macros —
// presumably USE_OP_* pins the registrar symbol so the linker keeps it;
// confirm against framework/op_registry.
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(slice);
REGISTER_OPERATOR_CPU(slice, ops::SliceOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(slice);
REGISTER_OPERATOR_MALI_GPU(slice, ops::SliceOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
// NOTE(review): no FPGA registration for slice yet.
#endif
#endif  // SLICE_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SLICE_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/slice_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using paddle_mobile::framework::Tensor;
// SliceOp: kernel-based operator wrapper for the slice op.
// InferShape() (in the matching .cpp) is still a stub — see its TODO.
template <typename DeviceType, typename T>
class SliceOp
    : public framework::OperatorWithKernel<
          DeviceType, SliceParam, operators::SliceKernel<DeviceType, T>> {
 public:
  // Forwards construction to the kernel-based base class.
  SliceOp(const std::string &type, const VariableNameMap &inputs,
          const VariableNameMap &outputs,
          const framework::AttributeMap &attrs,
          std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, SliceParam,
                                      operators::SliceKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  // Shape inference for this op; implementation lives in the .cpp file.
  void InferShape() const override;

  // NOTE(review): removed the inheriting-constructor using-declaration left
  // by the merge — it had the same signature as the explicit constructor
  // above and was therefore dead; callers are unaffected.
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -28,11 +28,9 @@ template class SoftmaxOp<CPU, float>;
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(softmax);
REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(softmax);
REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
......
......@@ -48,4 +48,13 @@ class SoftmaxOp
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(softmax);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(softmax);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -53,7 +53,6 @@ template class TransposeOp<CPU, float>;
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(transpose);
REGISTER_OPERATOR_CPU(transpose, ops::TransposeOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
......
......@@ -50,4 +50,12 @@ class TransposeOp : public framework::OperatorWithKernel<
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(transpose);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -52,8 +52,9 @@ int main() {
}
auto time1 = time();
paddle_mobile::operators::math::sgemm(m, n, k, 0.9, a, lda, b, ldb, 0.3, c,
ldc);
// paddle_mobile::operators::math::Sgemm(m, n, k, 0.9, a, lda, b, ldb, 0.3,
// c,
// ldc);
auto time2 = time();
DLOG << "gemm cost :" << time_diff(time1, time2) << "ms\n";
for (int i = 0; i < m * n; ++i) {
......
......@@ -26,16 +26,17 @@ int main() {
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time2) << "ms\n";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize);
executor.SetThreadNum(4);
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
auto time3 = time();
for (int i = 0; i < 10; ++i) {
int count = 1;
for (int i = 0; i < count; ++i) {
executor.Predict(input, dims);
}
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms\n";
DLOG << "predict cost :" << time_diff(time3, time4) / count << "ms\n";
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h"
#include "operators/prelu_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(g_resnet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::PReluOp<paddle_mobile::CPU, float>>
executor(program, "prelu");
// 1. input_tensors;
vector<Tensor> input_tensors;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {1, 2, 3, 4}, -1, 1);
input_tensors.push_back(input1);
// 2. input_names
vector<string> input_names({
"batch_norm_0.tmp_2",
});
// 3. output_names
vector<string> output_names({"batch_norm_0.tmp_3"});
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({1, 2, 3, 4});
out_ddims.push_back(out_ddim);
auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
for (int j = 0; j < output[0]->numel(); ++j) {
DLOG << " value of output: " << output0_data[j];
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/resize_op.h"
// Smoke test for the resize op inside the mobilenet-ssd model: feeds a
// random 2x3x3x2 tensor from "transpose_0.tmp_0" to "reshape_0.tmp_0"
// and prints every input and output value.
int main() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_mobilenet_ssd));
  if (program.originProgram == nullptr) {
    // Previously this only logged (with a garbled message) and fell
    // through to use the null program; bail out instead.
    DLOG << "program file read fail";
    return -1;
  }

  Executor4Test<paddle_mobile::CPU,
                paddle_mobile::operators::ResizeOp<paddle_mobile::CPU, float>>
      executor(program, "resize");

  paddle_mobile::framework::Tensor input;
  SetupTensor<float>(&input, {2, 3, 3, 2}, static_cast<float>(0),
                     static_cast<float>(1));
  auto input_ptr = input.data<float>();
  auto out_ddim = paddle_mobile::framework::make_ddim({2, 9, 2});
  auto output =
      executor.Predict(input, "transpose_0.tmp_0", "reshape_0.tmp_0", out_ddim);
  auto *output_ptr = output->data<float>();

  DLOG << "input : ";
  for (int j = 0; j < input.numel(); ++j) {
    DLOG << " index " << j << " : " << input_ptr[j];
  }
  DLOG << "output : ";
  for (int j = 0; j < output->numel(); ++j) {
    DLOG << " index " << j << " : " << output_ptr[j];
  }
  return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/scale_op.h"
// Placeholder: no standalone test for the scale op is implemented yet.
int main() {}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/slice_op.h"
// Placeholder: no standalone test for the slice op is implemented yet.
int main() {}
......@@ -89,7 +89,7 @@ build_for_android() {
}
build_for_ios() {
rm -rf "../build"
# rm -rf "../build"
PLATFORM="ios"
MODE="Release"
BUILD_DIR=../build/release/"${PLATFORM}"
......@@ -119,6 +119,10 @@ build_for_ios() {
fi
cd "${BUILD_DIR}"
make -j 8
cp ../../../src/ios_io/PaddleMobile.h ./build/PaddleMobile.h
cd ./build
# 生成符号表
ranlib *.a
}
build_error() {
......@@ -157,4 +161,4 @@ else
build_error
fi
fi
fi
fi
\ No newline at end of file
......@@ -19,7 +19,8 @@ elseif (NET STREQUAL "mobilenet")
set(BATCHNORM_OP ON)
set(POOL_OP ON)
set(RESHAPE_OP ON)
set(FUSION_CONVADDBNRELU_OP)
set(FUSION_CONVADDBNRELU_OP ON)
set(FUSION_CONVADD_OP ON)
elseif (NET STREQUAL "yolo")
set(BATCHNORM_OP ON)
set(CONV_OP ON)
......@@ -63,6 +64,10 @@ else ()
set(TRANSPOSE_OP ON)
set(FUSION_CONVADD_RELU_OP ON)
set(FUSION_CONVADDBNRELU_OP ON)
set(PRELU_OP ON)
set(RESIZE_OP ON)
set(SCALE_OP ON)
set(SLICE_OP ON)
set(DROPOUT_OP ON)
set(IM2SEQUENCE_OP ON)
# option(BATCHNORM_OP "" ON)
......@@ -150,6 +155,18 @@ endif()
if (FUSION_CONVADDBNRELU_OP)
add_definitions(-DFUSION_CONVADDBNRELU_OP)
endif()
if (PRELU_OP)
add_definitions(-DPRELU_OP)
endif()
if (RESIZE_OP)
add_definitions(-DRESIZE_OP)
endif()
if (SCALE_OP)
add_definitions(-DSCALE_OP)
endif()
if (SLICE_OP)
add_definitions(-DSLICE_OP)
endif()
if (DROPOUT_OP)
add_definitions(-DDROPOUT_OP)
endif()
......
......@@ -14,6 +14,10 @@ fi
# https://medicineyeh.wordpress.com/2017/07/13/clang-format-with-pragma/
shift
perl -i -pe 's|#pragma\s+omp|// <TRICKY-CLANG-FORMAT-PRAGMA-FIX> #pragma omp|' "$@"
clang-format -i $@
perl -i -pe 's|^\s+#pragma\s+omp|// <TRICKY-CLANG-FORMAT-PRAGMA-FIX> #pragma omp|' "$@"
(
# remove clang format ios_io folder
flist=$(echo "$@" | perl -pe 's|src/ios_io/[^ ]*||')
clang-format -i $flist
)
perl -i -pe 's|// <TRICKY-CLANG-FORMAT-PRAGMA-FIX> ||' "$@"
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册