Commit 1a220755 authored by qnqinan

update src code with remote

Parent 48e8715a
cmake_minimum_required(VERSION 3.0)
project(paddle-mobile)
# select the platform to build
option(CPU "armv7 with neon support" ON)
option(MALI_GPU "mali gpu support" OFF)
option(FPGA "fpga support" OFF)
cmake_minimum_required(VERSION 3.0.0)
option(USE_OPENMP "openmp support" OFF)
option(USE_OPENMP "openmp support" ON)
option(DEBUGING "enable debug mode" ON)
option(USE_EXCEPTION "use std exception" OFF)
option(USE_EXCEPTION "use std exception" ON)
option(SYMBOL_HIDDEN "symbol hidden" OFF) # on when use jni or ios io
option(LOG_PROFILE "log profile" OFF)
# select the platform to build
option(CPU "armv7 with neon" ON)
option(GPU_MALI "mali gpu" OFF)
option(GPU_CL "opencl gpu" OFF)
option(FPGA "fpga" OFF)
if(FPGA)
option(FPGAV1 "fpga v1" ON)
option(FPGAV2 "fpga v2" OFF)
endif()
project(paddle-mobile)
file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
@@ -29,10 +36,10 @@ if(DEBUGING)
message(STATUS "debugging mode")
add_definitions(-DPADDLE_MOBILE_DEBUG)
else()
if(FPGA)
else()
add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
endif()
endif()
if(SYMBOL_HIDDEN)
add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
endif()
if(USE_EXCEPTION)
@@ -70,7 +77,27 @@ else()
endforeach()
endif()
if(MALI_GPU)
if (GPU_CL)
add_definitions(-DPADDLE_MOBILE_CL)
# opencl version
add_definitions(-DCL_TARGET_OPENCL_VERSION=220)
link_libraries(${CMAKE_CURRENT_LIST_DIR}/third_party/opencl/libOpenCL.so)
include_directories(third_party/opencl/OpenCL-Headers)
else()
file(GLOB_RECURSE _tmp_list src/framework/cl/*.cpp src/operators/kernel/cl/*.cpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list_h src/framework/cl/*.h)
foreach(f ${_tmp_list_h})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
if (GPU_MALI)
add_definitions(-DPADDLE_MOBILE_MALI_GPU)
add_definitions(-DUSE_ACL=1)
add_definitions(-DUSE_OPENCL)
@@ -96,8 +123,43 @@ else()
endif()
if(FPGA)
message("FPGA mode enabled")
add_definitions(-DPADDLE_MOBILE_FPGA)
file(GLOB_RECURSE _tmp_list src/operators/math/*.cpp src/operators/kernel/fpga/*.cc)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list_h src/operators/math/*.h)
foreach(f ${_tmp_list_h})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
list(APPEND PADDLE_MOBILE_CC src/operators/math/softmax.cpp)
list(APPEND PADDLE_MOBILE_H src/operators/math/softmax.h)
list(APPEND PADDLE_MOBILE_H src/operators/math/math_func_neon.h)
if(FPGAV1)
message("FPGA_V1 enabled")
add_definitions(-DPADDLE_MOBILE_FPGA_V1)
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.cpp src/fpga/V2/*.cpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
endif()
if(FPGAV2)
message("FPGA_V2 enabled")
add_definitions(-DPADDLE_MOBILE_FPGA_V2)
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.cpp src/fpga/V1/*.cpp)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
endif()
else()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
foreach(f ${_tmp_list})
@@ -124,17 +186,17 @@ endif()
if(ANDROID_NDK_TOOLCHAIN_INCLUDED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
else()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.cpp)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
endif()
if(IS_IOS)
else()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.mm)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h)
endif()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.mm)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/op_symbols.h)
endif ()
set(CMAKE_VERBOSE_MAKEFILE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -143,8 +205,10 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
# NET default
if(FPGA)
set(NET "FPGAnets" CACHE STRING "select net type")
if(FPGAV1)
set(NET "FPGA_NET_V1" CACHE STRING "select net type")
elseif(FPGAV2)
set(NET "FPGA_NET_V2" CACHE STRING "select net type")
else()
set(NET "default" CACHE STRING "select net type")
endif()
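# Example (usage sketch): configuring with `cmake .. -DFPGA=ON -DFPGAV1=ON`
# selects NET=FPGA_NET_V1; passing -DNET=<name> on the command line pre-populates
# the cache, so these cached defaults will not override it.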
......
@@ -8,46 +8,23 @@
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)-->
Welcome to the Paddle-Mobile GitHub project.
Paddle-Mobile is a project under the PaddlePaddle organization: a deep learning framework dedicated to embedded platforms. Its design stays closely aligned with the latest fluid version of PaddlePaddle, while adding extensive optimizations for embedded use; performance, binary size, power consumption, and hardware-platform coverage were all considered from the very start of the design.
## Live results in the Simple Search app
The GIF below shows the production object-detection feature of the Simple Search app:
![ezgif-1-050a733dfb](http://otkwwi4x8.bkt.clouddn.com/2018-07-05-ezgif-1-050a733dfb.gif)
## Demo directory
[Click here](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo)
Welcome to the Paddle-Mobile GitHub project. Paddle-Mobile is a project under the PaddlePaddle organization: a deep learning framework dedicated to embedded platforms.
## Features
- **ARM CPU**
- **Mali GPU**
- **GPU Metal implementation for Apple devices**
- **FPGA**
- High-performance ARM CPU support
- Mali GPU support
- Adreno GPU support
- GPU Metal support on Apple devices
- Support for FPGA development boards such as ZU5 and ZU9
- Support for arm-linux boards such as Raspberry Pi
The ZCU102 development board is currently supported.
## Demo
- [ANDROID](https://github.com/xiebaiyuan/paddle-mobile-demo)
- **Flexibility**
  * The paddle-mobile CPU build depends on no third-party libraries and can be integrated quickly.
  * Template specialization is used to switch platforms, so CPU, GPU, and other coprocessors can be swapped flexibly (see the sketch after this list).
  * Only the ops required by a particular network need to be compiled, cutting build time and binary size.
  * Docker-based builds provide a uniform build environment.
  * Highly extensible: other coprocessors are easy to add, and the high-performance ARM operator implementations make integration convenient for coprocessor developers.
  * Directly compatible with paddle-fluid models; no extra conversion step is needed.
- **Size**
paddle-mobile has treated mobile binary size as a first-class concern since its initial design, and the CPU implementation has no external dependencies. Ops that a given network does not need are never compiled in, and build-option tuning shrinks the binary further.
Beyond the binary itself, we also keep the source compact; the code size of the whole repository is very small.
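A minimal sketch of the template-specialization switching described above, modeled on the `DeviceType` aliases from `src/common/types.h`; `ReluKernel` is a hypothetical kernel name used only for illustration:
```
// Compile-time platform switching via template specialization (sketch).
#include <iostream>

enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2, kGPU_CL = 3 };

template <DeviceTypeEnum T>
struct DeviceType {};

typedef DeviceType<kCPU> CPU;
typedef DeviceType<kGPU_CL> GPU_CL;

// Hypothetical kernel: one specialization per device tag.
template <typename Device>
struct ReluKernel;

template <>
struct ReluKernel<CPU> {
  void Run() { std::cout << "relu on ARM CPU (NEON path)\n"; }
};

template <>
struct ReluKernel<GPU_CL> {
  void Run() { std::cout << "relu on GPU via OpenCL\n"; }
};

int main() {
  ReluKernel<CPU>().Run();     // the device is picked at compile time
  ReluKernel<GPU_CL>().Run();  // no runtime dispatch is involved
  return 0;
}
```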
### Original demo directory
[https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo)
## Documentation
@@ -62,6 +39,7 @@ Paddle-Mobile is a project under the PaddlePaddle organization, a deep learning framework dedicated to embedded
* [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md)
* [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
* [FPGA](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_fpga.md)
* [ARM_LINUX](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_arm_linux.md)
### Contributing
- [Contribution guide](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
@@ -73,18 +51,22 @@ Paddle-Mobile is a project under the PaddlePaddle organization, a deep learning framework dedicated to embedded
### 1. Train directly with Paddle Fluid
This is the most reliable approach and the recommended one.
### 2. Convert a Caffe model to Paddle Fluid
[Link](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid)
[https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid)
### 3. ONNX
ONNX stands for "Open Neural Network Exchange". The project's goal is to let different neural-network frameworks interoperate.
Besides training a fluid model directly with PaddlePaddle, individual Paddle fluid models can also be obtained through ONNX conversion.
Baidu is also working on ONNX support. The conversion project lives here: [paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx)
![](http://7xop3k.com1.z0.glb.clouddn.com/15311951836000.jpg)
Baidu is also working on ONNX support. The conversion project lives here:
[https://github.com/PaddlePaddle/paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx)
### 4. Download some test models and test images
[Download link](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)
[http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)
<!--## Live results in the Simple Search app
The GIF below shows the production object-detection feature of the Simple Search app:
![ezgif-1-050a733dfb](http://otkwwi4x8.bkt.clouddn.com/2018-07-05-ezgif-1-050a733dfb.gif)-->
## Troubleshooting
@@ -96,5 +78,3 @@ Paddle-Mobile is released under the relatively permissive Apache-2.0 license [Apache-2.0 license](L
## Legacy Mobile-Deep-Learning
The original MDL (Mobile-Deep-Learning) project has moved here: [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning)
## How to run the demo
- Android demo download link:
http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip
- iOS demo download link:
http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip
Run the demo download script in the demo directory
## Demo download links
- [ANDROID](http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip)
- [IOS](http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip)
- Alternatively, the demos can be fetched with getDemo.sh:
```
sh getDemo.sh
```
The demo project is then downloaded and unpacked into the current directory.
\ No newline at end of file
## paddle-mobile GPU development documentation
For build-environment setup, see the development_android.md document.
1. Download paddle-mobile
```
git clone https://github.com/PaddlePaddle/paddle-mobile.git
adb pull /system/vendor/lib/libOpenCL.so paddle-mobile/third_party/opencl
# Edit paddle-mobile/CMakeLists.txt and turn on the OpenCL GPU option:
#   option(GPU_CL "opencl gpu" OFF)  ->  option(GPU_CL "opencl gpu" ON)
cd paddle-mobile/tools
sh build.sh android
```
2. Deploy the unit-test executable and model to the phone
Download the mobilenet model and the test_image_1x3x224x224_float file needed for testing from: http://mms-graph.bj.bcebos.com/paddle-mobile/opencl_test_src.zip
```
cd ../test
mkdir models
mkdir images
```
Copy mobilenet into the paddle-mobile/test/models directory.
Copy test_image_1x3x224x224_float into the paddle-mobile/test/images directory.
Run the following commands to deploy the executable and the files needed for inference to the phone:
```
cd ../tools/android-debug-script
sh push2android.sh
```
3. Run the corresponding executable in an adb shell (currently only mobilenet is supported; more network models will follow)
```
adb shell
cd /data/local/tmp/bin/
export LD_LIBRARY_PATH=.
./test-mobilenetgpu
```
4. mobilenet CPU inference
Assuming mobilenet and test_image_1x3x224x224_float have already been pushed to the phone, run the following commands for mobilenet CPU inference:
```
adb shell
cd /data/local/tmp/bin/
export LD_LIBRARY_PATH=.
./test-mobilenet
```
5. Inference results

| Device | mobilenet GPU | mobilenet CPU (1 thread) | mobilenet CPU (2 threads) | mobilenet CPU (4 threads) |
| --- | --- | --- | --- | --- |
| Xiaomi 6 (CPU 835, GPU Adreno 540) | ~41 ms | 108 ms | 65 ms | 38 ms |
| OPPO Find X (CPU 845, GPU Adreno 630) | ~27 ms | 90 ms | 50 ms | 29 ms |
# ARM_LINUX development documentation
paddle-mobile can currently be compiled directly on arm_linux platforms.
## Example: Raspberry Pi 3
### Build
From the paddle-mobile root directory, run:
```
cd tools
/bin/bash build.sh arm_linux googlenet
```
When the build finishes, the generated .so is under paddle-mobile/build/release/arm-linux/build, and the unit-test executables are under test/build.
### Run
```
cd ../build/release/arm-linux/build
export LD_LIBRARY_PATH=.
cd ../../../../test/build/
./test-googlenet
```
*Note 1: if there is no model under the local test directory, the official demo model is downloaded and unpacked automatically.*
*Note 2: because arm_linux devices have limited compute, it is recommended to build only the model you need (e.g. googlenet) or to enlarge the system swap space, so the build does not hang.*
## Other ARM_LINUX platforms
Other arm_linux platforms can be built by adjusting the relevant build parameters in tools/build.sh; refer to the build options for the platform in question.
Note in particular that for Android you should follow the Android development documentation.
# FPGA development documentation
The FPGA code has been tested with Resnet50 on a Xilinx ZCU102 revision 1.0 board, and the inference results are correct.
The FPGA code comes in two versions, V1 and V2. V1 has been tested with Resnet50 on a Xilinx ZCU102 revision 1.0 board, and the inference results are correct. The description below covers reproducing the V1 results.
## Prepare the hardware
___
@@ -17,7 +17,7 @@ ___
## Build the project
___
1. Copy the latest paddle-mobile code onto the ZCU102 board.
2. In the paddle-mobile root directory, set the platform in CMakeLists.txt to option(FPGA "fpga support" ON), and set the CPU and MALI\_GPU options to OFF.
2. In the paddle-mobile root directory, set the platform in CMakeLists.txt to option(FPGA "fpga support" ON), and set the CPU and MALI\_GPU options to OFF. Also set option(FPGAV1 "fpga v1" ON) and option(FPGAV2 "fpga v2" OFF).
3. Run the following commands to generate the test-resnet50 executable under ./test/build:
* mkdir build
* cd build
......
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <chrono>
#include <chrono> // NOLINT
namespace paddle_mobile {
using Time = decltype(std::chrono::high_resolution_clock::now());
@@ -25,3 +27,5 @@ inline double time_diff(Time t1, Time t2) {
ms counter = std::chrono::duration_cast<ms>(diff);
return counter.count() / 1000.0;
}
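// Usage sketch:
//   Time t1 = std::chrono::high_resolution_clock::now();
//   /* ... work ... */
//   double elapsed = time_diff(t1, std::chrono::high_resolution_clock::now());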
} // namespace paddle_mobile
@@ -46,7 +46,8 @@ struct PaddleMobileException : public std::exception {
std::string detail(buffer); \
throw paddle_mobile::PaddleMobileException("Custom Exception", buffer, \
__FILE__, __LINE__); \
}
} \
exit(0);
#define PADDLE_MOBILE_ENFORCE(stat, ...) \
{ \
......
@@ -82,6 +82,7 @@ std::unordered_map<
{G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}},
{G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
{G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
{G_OP_TYPE_SIGMOID, {{"X"}, {"Out"}}},
{G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
{G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},
......
@@ -39,7 +39,13 @@ struct PrecisionTrait<Precision::FP16> {
};
//! device type
enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
enum DeviceTypeEnum {
kINVALID = -1,
kCPU = 0,
kFPGA = 1,
kGPU_MALI = 2,
kGPU_CL = 3
};
template <DeviceTypeEnum T>
struct DeviceType {};
@@ -47,6 +53,7 @@ struct DeviceType {};
typedef DeviceType<kCPU> CPU;
typedef DeviceType<kFPGA> FPGA;
typedef DeviceType<kGPU_MALI> GPU_MALI;
typedef DeviceType<kGPU_CL> GPU_CL;
//! data type
enum DataType {
......
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/api.h"
#include "fpga/V1/api.h"
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <algorithm>
#include <map>
#include "fpga/bias_scale.h"
#include "fpga/filter.h"
#include "fpga/image.h"
#include "fpga/V1/bias_scale.h"
#include "fpga/V1/filter.h"
#include "fpga/V1/image.h"
#define FPGA_TEST_MODE
#define PADDLE_MOBILE_OS_LINUX
......
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/bias_scale.h"
#include "fpga/V1/bias_scale.h"
#include <memory.h>
#include "fpga/api.h"
#include "fpga/V1/api.h"
namespace paddle_mobile {
namespace fpga {
......
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/filter.h"
#include "fpga/V1/filter.h"
#include <memory.h>
#include <algorithm>
#include "fpga/api.h"
#include "fpga/V1/api.h"
namespace paddle_mobile {
namespace fpga {
......
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/image.h"
#include "fpga/V1/image.h"
#include <memory.h>
#include <algorithm>
#include "fpga/api.h"
#include "fpga/V1/api.h"
namespace paddle_mobile {
namespace fpga {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/api.h"
#include <algorithm>
#include "fpga/V2/bias_scale.h"
#include "fpga/V2/config.h"
#include "fpga/V2/filter.h"
#include "fpga/V2/image.h"
namespace paddle_mobile {
namespace fpga {
static std::map<void *, size_t> memory_map;
int open_device() {
int ret = open_device_driver();
return ret;
}
int close_device() {
int ret = close_device_driver();
return ret;
}
void *fpga_malloc(size_t size) {
static uint64_t counter = 0;
#ifdef PADDLE_MOBILE_ZU5
auto ptr = fpga_malloc_driver(size);
#else
auto ptr = malloc(size);
#endif
counter += size;
memory_map.insert(std::make_pair(ptr, size));
// DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
// << counter << " bytes";
return ptr;
}
void fpga_free(void *ptr) {
static uint64_t counter = 0;
size_t size = 0;
auto iter = memory_map.find(ptr); // std::map<void *, size_t>::iterator
if (iter != memory_map.end()) {
size = iter->second;
memory_map.erase(iter);
#ifdef PADDLE_MOBILE_ZU5
fpga_free_driver(ptr);
#else
free(ptr);
#endif
counter += size;
// DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
// << counter << " bytes";
} else {
DLOG << "Invalid pointer";
}
}
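// fp32_2_fp16 repacks an IEEE-754 float into half precision by shifting the
// sign, exponent and mantissa fields into place. The (112 << 10) term rebiases
// the exponent from FP32 (bias 127) to FP16 (bias 15), since 127 - 15 = 112;
// bit 0x1000 is the highest truncated mantissa bit, used to round up.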
half fp32_2_fp16(float fp32_num) {
unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT
auto t = (half)(((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
(((tmp & 0x7f800000) >> 13) - (112 << 10)));
if (tmp & 0x1000) {
t++; // roundoff
}
return t;
}
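// fp16_2_fp32 reverses the packing above; adding 112 to the stored exponent
// restores the FP32 bias (15 + 112 = 127).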
float fp16_2_fp32(half fp16_num) {
int frac = (fp16_num & 0x3ff);
int exp = ((fp16_num & 0x7c00) >> 10) + 112;
int s = fp16_num & 0x8000;
int tmp = 0;
float fp32_num;
tmp = s << 16 | exp << 23 | frac << 13;
fp32_num = *(float *)&tmp; // NOLINT
return fp32_num;
}
void format_image(framework::Tensor *image_tensor) {
auto dims = image_tensor->dims();
auto channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = image_tensor->data<float>();
size_t memory_size = channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
memcpy(new_data, data_ptr, memory_size);
int aligned_channel = filter::calc_aligned_channel((int)channel); // NOLINT
image::format_image(&new_data, (int)channel, (int)height, // NOLINT
(int)width, // NOLINT
aligned_channel);
image_tensor->reset_data_ptr(new_data);
}
void format_fp16_ofm(framework::Tensor *ofm_tensor, int aligned_channel) {
auto dims = ofm_tensor->dims();
size_t memory_size = 0;
if (dims.size() == 4) {
auto height = dims[2], width = dims[3];
memory_size = (height + 1) / 2 * 2 * width * aligned_channel * sizeof(half);
} else if (dims.size() == 2) {
memory_size = aligned_channel * sizeof(half);
} else {
DLOG << "Wrong ofm dimension";
}
auto p = fpga_malloc(memory_size);
memset(p, 0, memory_size);
ofm_tensor->reset_data_ptr(p);
}
void format_fp32_ofm(framework::Tensor *ofm_tensor, int aligned_channel) {
auto dims = ofm_tensor->dims();
size_t memory_size = 0;
if (dims.size() == 4) {
auto height = dims[2], width = dims[3];
memory_size = height * width * aligned_channel * sizeof(float);
} else if (dims.size() == 2) {
memory_size = aligned_channel * sizeof(float);
} else {
DLOG << "Wrong ofm dimension";
}
auto p = fpga_malloc(memory_size);
memset(p, 0, memory_size);
ofm_tensor->reset_data_ptr(p);
}
float filter_find_max(framework::Tensor *filter_tensor) {
auto filter_ptr = filter_tensor->data<float>();
return filter::find_max(filter_ptr, (int)filter_tensor->numel()); // NOLINT
}
int get_aligned_channel_num(int channel_num) {
return filter::calc_aligned_channel(channel_num);
}
int get_aligned_filter_num(framework::Tensor *filter_tensor) {
auto dims = filter_tensor->dims();
return filter::calc_aligned_num((int)dims[0], (int)dims[1]); // NOLINT
}
int get_conv_output_channel(framework::Tensor *filter_tensor) {
int aligned_filter_num = get_aligned_filter_num(filter_tensor);
return get_aligned_channel_num(aligned_filter_num);
}
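// Filters are quantized to int8 against the per-tensor max: scale[0] (max/127)
// maps int8 values back to float, and scale[1] (127/max) maps float to int8.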
void format_filter(framework::Tensor *filter_tensor, float max_value,
int group_num) {
filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT
filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT
auto dims = filter_tensor->dims();
auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = filter_tensor->data<float>();
size_t memory_size = num * channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
memcpy(new_data, data_ptr, memory_size);
filter::format_filter(&new_data, (int)num, (int)channel, // NOLINT
(int)height, // NOLINT
(int)width, group_num, max_value); // NOLINT
filter_tensor->reset_data_ptr(new_data);
}
void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT
filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT
auto dims = filter_tensor->dims();
auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = filter_tensor->data<float>();
size_t memory_size = num * channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
memcpy(new_data, data_ptr, memory_size);
filter::format_fc_filter(&new_data, (int)num, (int)channel, // NOLINT
(int)height, // NOLINT
(int)width, 1, max_value); // NOLINT
filter_tensor->reset_data_ptr(new_data);
}
void format_bias_scale_array(float **bias_scale_array, int filter_num,
int filter_channel) {
int num_after_alignment =
filter::calc_aligned_num(filter_channel, filter_channel);
bias_scale::format_bias_scale_array(bias_scale_array, filter_num,
num_after_alignment);
}
void format_concat_output(framework::Tensor *out, int height, int width,
uint32_t out_channel) {
auto data_ptr = fpga_malloc(out_channel * height * width * sizeof(half));
auto ddim = framework::make_ddim({1, out_channel, height, width});
out->Resize(ddim);
out->reset_data_ptr(data_ptr);
}
int format_conv_data(framework::Tensor *filter_tensor,
framework::Tensor *ofm_tensor, float *bs_ptr, int group) {
float max_value = fpga::filter_find_max(filter_tensor);
fpga::format_filter(filter_tensor, max_value, group);
int aligned_num = get_aligned_filter_num(filter_tensor);
fpga::format_bias_scale_array(&bs_ptr,
(int)filter_tensor->dims()[0], // NOLINT
aligned_num);
int aligned_channel = fpga::get_conv_output_channel(filter_tensor);
fpga::format_fp16_ofm(ofm_tensor, aligned_channel);
DLOG << aligned_channel;
return aligned_channel;
}
int format_fc_data(framework::Tensor *filter_tensor,
framework::Tensor *ofm_tensor, float *bs_ptr) {
float max_value = fpga::filter_find_max(filter_tensor);
fpga::format_fc_filter(filter_tensor, max_value);
int aligned_num = get_aligned_filter_num(filter_tensor);
fpga::format_bias_scale_array(&bs_ptr,
(int)filter_tensor->dims()[0], // NOLINT
aligned_num);
int aligned_channel = fpga::get_conv_output_channel(filter_tensor);
fpga::format_fp16_ofm(ofm_tensor, aligned_channel);
DLOG << aligned_channel;
return aligned_channel;
}
void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter,
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w, float *bs_ptr) {
auto input_ptr = input->data<float>();
auto filter_ptr = filter->data<float>();
auto out_ptr = out->data<float>();
arg->group_num = (uint32_t)group_num;
arg->split_num = 1;
arg->filter_num = (uint32_t)filter->dims()[0];
arg->output.address = out_ptr;
arg->output.scale_address = out->scale;
arg->conv_args =
(ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT
arg->concat_arg.image_num = arg->split_num;
arg->concat_arg.image_out = out_ptr;
arg->concat_arg.scale_out = out->scale;
arg->concat_arg.height = (uint32_t)out->dims()[2];
arg->concat_arg.width = (uint32_t)out->dims()[3];
int n = arg->split_num;
arg->concat_arg.images_in =
(half **)fpga_malloc(n * sizeof(int *)); // NOLINT
arg->concat_arg.scales_in =
(float **)fpga_malloc(n * sizeof(float *)); // NOLINT
arg->concat_arg.channel_num =
(uint32_t *)fpga_malloc(n * sizeof(uint32_t)); // NOLINT
for (int i = 0; i < n; i++) {
arg->conv_args[i].relu_enabled = relu_enabled;
arg->conv_args[i].sb_address = bs_ptr;
arg->conv_args[i].filter_address = (int8_t *)filter_ptr; // NOLINT
arg->conv_args[i].filter_scale_address = filter->scale;
arg->conv_args[i].filter_num = arg->filter_num;
arg->conv_args[i].group_num = (uint32_t)group_num;
arg->conv_args[i].kernel.stride_h = (uint32_t)stride_h;
arg->conv_args[i].kernel.stride_w = (uint32_t)stride_w;
arg->conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
arg->conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
arg->conv_args[i].image.address = input_ptr;
arg->conv_args[i].image.scale_address = input->scale;
arg->conv_args[i].image.channels = (uint32_t)input->dims()[1];
arg->conv_args[i].image.height = (uint32_t)input->dims()[2];
arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
arg->conv_args[i].image.pad_height = (uint32_t)padding_h;
arg->conv_args[i].image.pad_width = (uint32_t)padding_w;
arg->conv_args[i].output.address = out_ptr;
arg->conv_args[i].output.scale_address = out->scale;
}
}
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <cstddef>
#include <iostream>
#include <limits>
#include "fpga/V2/driver/driver.h"
#include "fpga/V2/driver/pe.h"
#include "framework/tensor.h"
namespace paddle_mobile {
namespace fpga {
enum DataType {
DATA_TYPE_FP32 = 1,
DATA_TYPE_FP16 = 0,
};
enum LayoutType {
LAYOUT_CHW = 1,
LAYOUT_HWC = 0,
};
struct KernelArgs {
uint32_t width;
uint32_t height;
uint32_t stride_w;
uint32_t stride_h;
};
struct ImageInputArgs {
void* address; // input featuremap virtual address
float* scale_address; // input scale address;
uint32_t channels;
uint32_t width; // featuremap width
uint32_t height;
uint32_t pad_width; // padding width;
uint32_t pad_height;
};
struct ImageOutputArgs {
void* address; // output result address;
float* scale_address; // output scale address;
uint64_t timer_cnt; // time counter for FPGA computation
};
struct ConvArgs {
bool relu_enabled;
void* sb_address; // scale and bias are interlaced;
void* filter_address;
float* filter_scale_address;
uint32_t filter_num;
uint32_t group_num;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
};
struct ConcatArgs {
uint32_t image_num;
half** images_in;
float** scales_in;
void* image_out;
float* scale_out;
uint32_t* channel_num;
uint32_t* aligned_channel_num;
uint32_t out_channel;
uint32_t height;
uint32_t width;
};
struct SplitConvArgs {
uint32_t split_num;
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct ConvArgs* conv_args;
struct ConcatArgs concat_arg;
};
struct PoolingArgs {
int16_t mode; // mode: 0:max, 1:avg
half kernel_reciprocal;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
};
struct EWAddArgs {
bool relu_enabled;
uint32_t const0; // output0 = const0 x input0 + const1 x input1;
uint32_t const1;
struct ImageInputArgs image0;
struct ImageInputArgs image1;
struct ImageOutputArgs output;
};
struct BypassArgs {
enum DataType input_data_type;
enum DataType output_data_type;
enum LayoutType input_layout_type;
enum LayoutType output_layout_type;
struct ImageInputArgs image;
struct ImageOutputArgs output;
};
int open_device();
int close_device();
void* fpga_malloc(size_t size);
void fpga_free(void* ptr);
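// Rounds num up to the next multiple of x, e.g. align_to_x(5, 4) == 8.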
static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
float filter_find_max(framework::Tensor* filter_tensor);
int get_aligned_channel_num(int channel_num);
int get_aligned_filter_num(framework::Tensor* filter_tensor);
int get_conv_output_channel(framework::Tensor* filter_tensor);
void format_image(framework::Tensor* image_tensor);
void format_fp16_ofm(framework::Tensor* ofm_tensor,
int aligned_channel); // only allocate memory
void format_fp32_ofm(framework::Tensor* ofm_tensor, int aligned_channel);
void format_filter(framework::Tensor* filter_tensor, float max_value,
int group_num);
void format_fc_filter(framework::Tensor* filter_tensor, float max_value);
void format_bias_scale_array(float** bias_scale_array, int filter_num,
int filter_channel);
void format_concat_output(framework::Tensor* out, int height, int width,
uint32_t out_channel);
int format_conv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float* bs_ptr, int group);
int format_fc_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float* bs_ptr);
void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w, float* bs_ptr);
half fp32_2_fp16(float fp32_num);
float fp16_2_fp32(half fp16_num);
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/bias_scale.h"
#include <memory.h>
#include "fpga/V2/api.h"
namespace paddle_mobile {
namespace fpga {
namespace bias_scale {
void align_element(float **data_in, int num, int num_after_alignment) {
float *ptr_unaligned = *data_in;
int total_element = 2 * num_after_alignment; // including bias & scale
float *ptr_aligned =
(float *)fpga_malloc(total_element * sizeof(float)); // NOLINT
memset(ptr_aligned, 0, total_element * sizeof(float));
for (int i = 0; i < num; i++) {  // start at 0 so bias[0]/scale[0] are kept
ptr_aligned[i * 2 + 0] = ptr_unaligned[i];
ptr_aligned[i * 2 + 1] = ptr_unaligned[i + num];
}
fpga_free(ptr_unaligned);
*data_in = ptr_aligned;
}
void format_bias_scale_array(float **data_in, int num,
int num_after_alignment) {
align_element(data_in, num, num_after_alignment);
}
} // namespace bias_scale
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle_mobile {
namespace fpga {
namespace bias_scale {
void align_element(float **data_in, int num, int num_after_alignment);
void format_bias_scale_array(float **data_in, int num, int num_after_alignment);
} // namespace bias_scale
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define PADDLE_MOBILE_ZU5
#define FPGA_PRINT_MODE
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/driver/bitmap.h"
namespace fpga_bitmap {
void bitmap_set(uint64_t *map, unsigned int start, int len) {
uint64_t *p = map + BIT_WORD(start);
const unsigned int size = start + len;
int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
uint64_t mask_to_set = BITMAP_FIRST_WORD_MASK(start);
while (len - bits_to_set >= 0) {
*p |= mask_to_set;
len -= bits_to_set;
bits_to_set = BITS_PER_LONG;
mask_to_set = ~0UL;
p++;
}
if (len) {
mask_to_set &= BITMAP_LAST_WORD_MASK(size);
*p |= mask_to_set;
}
}
void bitmap_clear(uint64_t *map, unsigned int start, int len) {
uint64_t *p = map + BIT_WORD(start);
const unsigned int size = start + len;
int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
uint64_t mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
while (len - bits_to_clear >= 0) {
*p &= ~mask_to_clear;
len -= bits_to_clear;
bits_to_clear = BITS_PER_LONG;
mask_to_clear = ~0UL;
p++;
}
if (len) {
mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
*p &= ~mask_to_clear;
}
}
static uint64_t ffs(uint64_t data) {
uint64_t bit = 0;
int i = 0;
// Scan all 64 bits; sizeof(data) would only cover the first 8.
for (i = 0; i < 64; i++) {
if (data & (1UL << i)) {
bit = i;
break;
}
}
return bit;
}
static uint64_t _find_next_bit(const uint64_t *addr, uint64_t nbits,
uint64_t start, uint64_t invert) {
uint64_t tmp = 0;
if (!nbits || start >= nbits) return nbits;
tmp = addr[start / BITS_PER_LONG] ^ invert;
/* Handle 1st word. */
tmp &= BITMAP_FIRST_WORD_MASK(start);
start = round_down(start, BITS_PER_LONG);
while (!tmp) {
start += BITS_PER_LONG;
if (start >= nbits) return nbits;
tmp = addr[start / BITS_PER_LONG] ^ invert;
}
return (start + ffs(tmp)) < nbits ? (start + ffs(tmp)) : nbits;
}
uint64_t find_next_zero_bit(const uint64_t *addr, uint64_t size,
uint64_t offset) {
return _find_next_bit(addr, size, offset, ~0UL);
}
uint64_t find_next_bit(const uint64_t *addr, uint64_t size, uint64_t offset) {
return _find_next_bit(addr, size, offset, 0UL);
}
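// First-fit search for a run of nr zero bits: find the next zero bit, align it,
// and if a set bit interrupts the candidate window, retry just past it.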
uint64_t bitmap_find_next_zero_area_off(uint64_t *map, uint64_t size,
uint64_t start, unsigned int nr,
uint64_t align_mask,
uint64_t align_offset) {
uint64_t index = 0;
uint64_t end = 0;
uint64_t i = 0;
again:
index = find_next_zero_bit(map, size, start);
/* Align allocation */
index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset;
end = index + nr;
if (end > size) return end;
i = find_next_bit(map, end, index);
if (i < end) {
start = i + 1;
goto again;
}
return index;
}
uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
uint64_t start, unsigned int nr,
uint64_t align_mask) {
return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask, 0);
}
} // namespace fpga_bitmap
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <stdio.h>
#define BITS_PER_LONG 64
#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
#define round_down(x, y) ((x) & ~((y)-1))
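// Note: round_down assumes y is a power of two.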
namespace fpga_bitmap {
void bitmap_set(uint64_t *map, unsigned int start, int len);
void bitmap_clear(uint64_t *map, unsigned int start, int len);
uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
uint64_t start, unsigned int nr,
uint64_t align_mask);
} // namespace fpga_bitmap
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include "common/enforce.h"
#include "fpga/V2/driver/bitmap.h"
#include "fpga/V2/driver/driver.h"
namespace paddle_mobile {
namespace fpga {
struct FPGA_INFO g_fpgainfo;
int open_drvdevice() {
if (g_fpgainfo.fd_drv == -1) {
g_fpgainfo.fd_drv = open(g_fpgainfo.drvdevice_path, O_RDWR);
}
return g_fpgainfo.fd_drv;
}
int open_memdevice() {
if (g_fpgainfo.fd_mem == -1) {
g_fpgainfo.fd_mem = open(g_fpgainfo.memdevice_path, O_RDWR | O_DSYNC);
}
return g_fpgainfo.fd_mem;
}
void pl_reset() {
// DLOG << "PL RESET";
// reg_writeq(0x5a, REG_FPGA_RESET);
usleep(100 * 1000);
}
void setup_pe(struct pe_data_s *pe_data, struct fpga_pe *pe,
char const *type_name, int pe_idx) {
memset(pe, 0, sizeof(struct fpga_pe));
pe->outer = pe_data;
snprintf(pe->type_name, MAX_TYPE_NAME_LENTH, "%s", type_name);
pe->status = IDLE;
pe->interrupt_cnt = 0;
pe_data->pes[pe_idx] = pe;
pe_data->pe_num++;
}
void pl_init() {
struct pe_data_s *pe_data = nullptr;
pl_reset();
pe_data = (struct pe_data_s *)malloc(sizeof(struct pe_data_s));
if (pe_data == nullptr) {
DLOG << "pe_data malloc error!";
return;
}
memset(pe_data, 0, sizeof(struct pe_data_s));
pthread_mutex_init(&pe_data->mutex, 0);
setup_pe(pe_data, &pe_data->pe_conv, "CONV", PE_IDX_CONV);
setup_pe(pe_data, &pe_data->pe_pooling, "POOLING", PE_IDX_POOLING);
setup_pe(pe_data, &pe_data->pe_ew, "EW", PE_IDX_EW);
setup_pe(pe_data, &pe_data->pe_bypass, "BYPASS", PE_IDX_BYPASS);
g_fpgainfo.pe_data = pe_data;
}
void pl_destroy() {
struct pe_data_s *pe_data = g_fpgainfo.pe_data;
pthread_mutex_destroy(&pe_data->mutex);
free(pe_data);
}
void pl_start() {
struct pe_data_s *pe_data = g_fpgainfo.pe_data;
pthread_mutex_unlock(&pe_data->mutex);
}
void pl_stop() {
struct pe_data_s *pe_data = g_fpgainfo.pe_data;
pthread_mutex_lock(&pe_data->mutex);
}
void pl_reinit() {
struct pe_data_s *pe_data = g_fpgainfo.pe_data;
struct fpga_pe *pe = nullptr;
int i = 0;
pl_stop();
pl_reset();
pl_start();
for (i = 0; i < pe_data->pe_num; i++) {
pe = pe_data->pes[i];
pe->status = IDLE;
pe->interrupt_cnt = 0;
}
pl_start();
}
int pl_get_status() { return 0; }
/* time is in microseconds */
int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
uint64_t i = 0;
/* accuracy of the timeout is to be confirmed */
int64_t timeout = time * CPU_FREQ / 1000000;
for (i = 0; i < timeout; i++) {
if (val == reg_readq(reg)) {
break;
}
}
if (i < timeout) {  // i == timeout means the poll timed out
return 0;
} else {
return -1;
}
}
/* memory management */
int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE);
unsigned int nr = (unsigned int)_nr;
int ret = 0;
pthread_mutex_lock(&memory->mutex);
unsigned int pos = (unsigned int)fpga_bitmap::bitmap_find_next_zero_area(
memory->bitmap, memory->page_num, 0, nr, 0);
if (pos <= memory->page_num) {
uint64_t address_ofset =
memory->mem_start + ((uint64_t)pos) * FPGA_PAGE_SIZE;
fpga_bitmap::bitmap_set(memory->bitmap, pos, nr);
memory->nr[pos] = nr;
*addr = address_ofset;
} else {
ret = -ENOMEM;
}
pthread_mutex_unlock(&memory->mutex);
return ret;
}
void memory_release(struct fpga_memory *memory) {
pthread_mutex_lock(&memory->mutex);
fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
pthread_mutex_unlock(&memory->mutex);
}
int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) {
int rc = 0;
uint64_t *bitmap = nullptr;
unsigned int *nr = nullptr;
// Only a single memory instance may be created, so building the memory struct
// needs no mutual exclusion.
// pthread_mutex_lock(&memory->mutex);
memory->page_num = (unsigned int)(memory_size / FPGA_PAGE_SIZE);
memory->page_num_long = DIV_ROUND_UP(memory->page_num, BITS_PER_LONG);
bitmap =
(uint64_t *)malloc(sizeof(int64_t) * memory->page_num_long); // NOLINT
if (!bitmap) {
rc = -EFAULT;
return rc;
}
memory->bitmap = bitmap;
nr = (unsigned int *)calloc(memory->page_num, sizeof(unsigned int));
if (!nr) {
rc = -EFAULT;
free(bitmap);
return rc;
}
memory->nr = nr;
memory->mem_start = FPGA_MEM_PHY_ADDR;
memory->mem_end = FPGA_MEM_SIZE;
// pthread_mutex_unlock(memory->mutex);
return rc;
}
int create_fpga_memory(struct fpga_memory **memory_info) {
int rc = 0;
*memory_info = (struct fpga_memory *)malloc(sizeof(struct fpga_memory));
if (*memory_info == NULL) {
rc = -EFAULT;
return rc;
}
pthread_mutex_init(&((*memory_info)->mutex), nullptr);
rc = create_fpga_memory_inner(*memory_info, FPGA_MEM_SIZE);
if (rc) {
free(*memory_info);
}
return rc;
}
int init_fpga_memory(struct fpga_memory *memory) {
int rc = 0;
if (!memory) {
rc = -EFAULT;
return rc;
}
// spin_lock_init(&memory->spin);
fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
fpga_bitmap::bitmap_set(memory->bitmap, 0, 1); // NOTE reserve fpga page 0.
return 0;
}
void destroy_fpga_memory(struct fpga_memory *memory) {
if (memory) {
free(memory->nr);
free(memory->bitmap);
free(memory);
}
}
int fpga_memory_add() {
int rc = 0;
rc = create_fpga_memory(&g_fpgainfo.memory_info);
if (rc) {
return rc;
}
rc = init_fpga_memory(g_fpgainfo.memory_info);
if (rc) {
destroy_fpga_memory(g_fpgainfo.memory_info);
return rc;
}
return 0;
}
uint64_t vaddr_to_paddr(void *address) {
uint64_t paddr = 0;
auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(address);
if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
paddr = iter->second;
} else {
DLOG << "Invalid pointer";
}
return paddr;
}
void *fpga_reg_malloc(size_t size) {
void *ret = nullptr;
ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED,
g_fpgainfo.fd_drv, FPGA_REG_PHY_ADDR);
// PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1");
g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size));
return ret;
}
void *fpga_malloc_driver(size_t size) {
void *ret = nullptr;
uint64_t phy_addr = 0;
memory_request(g_fpgainfo.memory_info, size, &phy_addr);
ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED,
g_fpgainfo.fd_mem, phy_addr);
PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1");
g_fpgainfo.fpga_vaddr2paddr_map.insert(std::make_pair(ret, phy_addr));
g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size));
return ret;
}
void fpga_free_driver(void *ptr) {
size_t size = 0;
auto iter = g_fpgainfo.fpga_addr2size_map.find(ptr);
if (iter != g_fpgainfo.fpga_addr2size_map.end()) {
size = iter->second;
g_fpgainfo.fpga_addr2size_map.erase(iter);
munmap(ptr, size);
} else {
DLOG << "Invalid pointer";
}
}
int open_device_driver() {
g_fpgainfo.FpgaRegPhyAddr = FPGA_REG_PHY_ADDR;
g_fpgainfo.FpgaMemPhyAddr = FPGA_MEM_PHY_ADDR;
g_fpgainfo.FpgaRegVirAddr = nullptr;
g_fpgainfo.pe_data = nullptr;
g_fpgainfo.drvdevice_path = "/dev/fpgadrv0";
g_fpgainfo.memdevice_path = "/dev/fpgamem0";
g_fpgainfo.fd_drv = -1;
g_fpgainfo.fd_mem = -1;
int ret = 0;
ret = open_drvdevice();
ret |= open_memdevice();
g_fpgainfo.FpgaRegVirAddr =
(uint64_t *)fpga_reg_malloc(FPGA_REG_SIZE); // NOLINT
fpga_memory_add();
pl_init();
return ret;
}
int close_device_driver() {
pl_destroy();
fpga_free_driver(g_fpgainfo.FpgaRegVirAddr);
memory_release(g_fpgainfo.memory_info);
destroy_fpga_memory(g_fpgainfo.memory_info);
return 0;
}
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <cstring>
#include <map>
#include "common/log.h"
namespace paddle_mobile {
namespace fpga {
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
#define FPGA_REG_PHY_ADDR 0xa0000000
#define FPGA_REG_SIZE 0x1000
#define FPGA_MEM_PHY_ADDR 0x20000000
#define FPGA_MEM_SIZE 0x20000000
#define CPU_FREQ 1000000000
#define FPGA_PAGE_SIZE (16UL * 1024UL)
// PE related macros
const int MAX_NUM_PES = 6;
const size_t MAX_TYPE_NAME_LENTH = 8;
const int PE_IDX_CONV = 0;
const int PE_IDX_POOLING = 1;
const int PE_IDX_EW = 2;
const int PE_IDX_BYPASS = 3;
enum pe_status { IDLE = 0, BUSY = 1 };
struct fpga_pe {
char type_name[MAX_TYPE_NAME_LENTH + 1];
struct pe_data_s *outer;
pe_status status; // 0=idle 1=busy -1=fail
uint64_t interrupt_cnt;
};
struct pe_data_s {
pthread_mutex_t mutex;
struct fpga_pe pe_conv;
struct fpga_pe pe_pooling;
struct fpga_pe pe_ew;
struct fpga_pe pe_bypass;
struct fpga_pe *pes[MAX_NUM_PES];
int pe_num;
};
struct fpga_memory {
pthread_mutex_t mutex;
uint64_t *bitmap;
unsigned int *nr;
unsigned int page_num;
unsigned int page_num_long;
uint64_t mem_start;
uint64_t mem_end;
};
struct FPGA_INFO {
uint64_t FpgaRegPhyAddr;
uint64_t FpgaMemPhyAddr;
pthread_t poll_pid;
void *FpgaRegVirAddr;
struct pe_data_s *pe_data;
std::map<void *, size_t> fpga_addr2size_map;
std::map<void *, uint64_t> fpga_vaddr2paddr_map;
const char *drvdevice_path;
const char *memdevice_path;
struct fpga_memory *memory_info;
int fd_drv;
int fd_mem;
};
extern struct FPGA_INFO g_fpgainfo;
inline uint64_t reg_readq(uint32_t offset) {
// DLOG << "offset : " << offset;
uint64_t value =
*(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset); // NOLINT
return value;
}
inline void reg_writeq(uint64_t value, uint32_t offset) {
// DLOG << "offset : " << offset << ", value : " << value;
*(uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + offset) = // NOLINT
value;
}
int open_device_driver();
int close_device_driver();
void *fpga_malloc_driver(size_t size);
void fpga_free_driver(void *ptr);
/*pe*/
uint64_t vaddr_to_paddr(void *address);
int fpga_regpoll(uint64_t reg, uint64_t val, int time);
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/driver/pe.h"
#include "fpga/V2/config.h"
#include "fpga/V2/driver/driver.h"
#include "fpga/V2/filter.h"
#include "fpga/V2/image.h"
namespace paddle_mobile {
namespace fpga {
#define MUL8(x) ((x)*8)
#define BYPASS_DONE 1
float Findfp16Max() {
uint16_t abs_vals[16];
uint64_t max_fp16;
max_fp16 = reg_readq(MUL8(49));
abs_vals[0] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT
abs_vals[1] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT
abs_vals[2] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT
abs_vals[3] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT
max_fp16 = reg_readq(MUL8(50));
abs_vals[4] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT
abs_vals[5] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT
abs_vals[6] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT
abs_vals[7] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT
max_fp16 = reg_readq(MUL8(51));
abs_vals[8] = (uint16_t)(0x0000007f & (max_fp16)); // NOLINT
abs_vals[9] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT
abs_vals[10] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT
abs_vals[11] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT
max_fp16 = reg_readq(MUL8(52));
abs_vals[12] = (uint16_t)(0x0000007f & (max_fp16));
abs_vals[13] = (uint16_t)(0x0000007f & (max_fp16 >> 16)); // NOLINT
abs_vals[14] = (uint16_t)(0x0000007f & (max_fp16 >> 32)); // NOLINT
abs_vals[15] = (uint16_t)(0x0000007f & (max_fp16 >> 48)); // NOLINT
uint16_t tmp = 0;
for (int i = 0; i < 16; i++) {
if (tmp < abs_vals[i]) {
tmp = abs_vals[i];
}
}
return fp16_2_fp32(tmp) / 127.0f;
}
int ComputeFpgaConv(const struct SplitConvArgs &args) {
return ComputeBasicConv(args.conv_args[0]);
}
int ComputeBasicConv(const struct ConvArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "======Compute Basic Conv======";
DLOG << " relu_enabled:" << args.relu_enabled
<< " sb_address:" << args.sb_address
<< " filter_address:" << args.filter_address
<< " filter_num:" << args.filter_num
<< " group_num:" << args.group_num;
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
#ifndef PADDLE_MOBILE_ZU5
return 0;
#endif
return 0;
}
int ComputeFpgaPool(const struct PoolingArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFpgaPool===========";
DLOG << " mode:" << args.mode
<< " kernel_reciprocal:" << fp16_2_fp32(args.kernel_reciprocal);
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
#ifndef PADDLE_MOBILE_ZU5
return 0;
#endif
return 0;
}
int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFpgaEWAdd===========";
DLOG << " relu_enabled:" << args.relu_enabled
<< " const0:" << fp16_2_fp32(int16_t(args.const0))
<< " const1:" << fp16_2_fp32(int16_t(args.const1));
DLOG << " image0_address:" << args.image0.address
<< " image0_scale_address:" << args.image0.scale_address
<< " image0_channels:" << args.image0.channels
<< " image0_height:" << args.image0.height
<< " image0_width:" << args.image0.width
<< " pad0_height:" << args.image0.pad_height
<< " pad0_width:" << args.image0.pad_width;
DLOG << " image1_address:" << args.image1.address
<< " image1_scale_address:" << args.image1.scale_address
<< " image1_channels:" << args.image1.channels
<< " image1_height:" << args.image1.height
<< " image1_width:" << args.image1.width
<< " pad1_height:" << args.image1.pad_height
<< " pad_width:" << args.image1.pad_width;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
#ifndef PADDLE_MOBILE_ZU5
return 0;
#endif
return 0;
}
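// PerformBypass streams a feature map through the FPGA bypass engine to convert
// precision. The high byte of bp_enable selects the mode (0x88 fp32->fp16,
// 0x8a fp16->fp32, 0x89 fp16->fp16 find-max) and the low bits carry the
// transfer length in bytes.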
int PerformBypass(const struct BypassArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFpgaBypass===========";
DLOG << " input_type:" << args.input_data_type
<< " output_type:" << args.output_data_type
<< " input_layout_type:" << args.input_layout_type
<< " output_layout_type:" << args.output_layout_type;
DLOG << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
#ifndef PADDLE_MOBILE_ZU5
return 0;
#endif
uint64_t ifm_src_paddr = vaddr_to_paddr(args.image.address);
uint64_t ifm_dst_paddr = vaddr_to_paddr(args.output.address);
uint64_t bp_enable;
int64_t length;
uint64_t pixels;
// fp32->fp16
if ((args.input_data_type) && (!args.output_data_type)) {
pixels = (args.image.channels) * (args.image.width) * (args.image.height);
length = pixels * sizeof(float);
bp_enable = 0x8800000000000000 + length;
}
// fp16->fp32
else if ((!args.input_data_type) && (args.output_data_type)) {
pixels = filter::calc_aligned_channel((args.image.channels)) *
(args.image.width) * (args.image.height);
length = pixels * sizeof(short);
length = align_to_x((int)length, 64); // NOLINT
bp_enable = 0x8a00000000000000 + length;
}
// fp16->fp16 findmax
else if ((!args.input_data_type) && (!args.output_data_type)) {
pixels = (args.image.channels) * (args.image.width) * (args.image.height);
length = pixels * sizeof(short);
bp_enable = 0x8900000000000000 + length;
} else {
return -1;
}
// start bypass
reg_writeq(ifm_src_paddr, MUL8(27));
reg_writeq(ifm_dst_paddr, MUL8(28));
reg_writeq(0, MUL8(0));
reg_writeq(bp_enable, MUL8(0));
// poll
int ret = -1;
ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
if (ret != -1) {
// clear "irq"
reg_readq(MUL8(63));
}
// get max value
if ((!args.input_data_type) && (!args.output_data_type)) {
float scale = Findfp16Max();
args.output.scale_address[0] = (float)(1.0 / scale); // NOLINT
args.output.scale_address[1] = scale;
}
return ret;
}
int ComputeFPGAConcat(const struct ConcatArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFpgaConcat===========";
DLOG << " Image_num: " << args.image_num
<< " out_address:" << args.image_out
<< " out_scale_address:" << args.scale_out
<< " out_channel:" << args.out_channel;
DLOG << " image_height:" << args.height << " image_width:" << args.width;
for (int i = 0; i < args.image_num; i++) {
DLOG << " " << i << "th: ";
DLOG << " channel_num:" << args.channel_num[i]
<< " aligned_channel_num:" << args.aligned_channel_num[i]
<< " image_address:" << args.images_in[i]
<< " image_scale_address:" << args.scales_in[i];
}
#endif
image::concat_images(args.images_in, args.scales_in, args.image_out,
args.scale_out, args.image_num, args.channel_num,
args.height, args.width, args.aligned_channel_num,
args.out_channel);
return 0;
}
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "fpga/V2/api.h"
namespace paddle_mobile {
namespace fpga {
int PerformBypass(const struct BypassArgs& args);
int ComputeBasicConv(const struct ConvArgs& args);
int ComputeFpgaPool(const struct PoolingArgs& args);
int ComputeFpgaEWAdd(const struct EWAddArgs& args);
int ComputeFpgaConv(const struct SplitConvArgs& args);
int ComputeFPGAConcat(const struct ConcatArgs& args);
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/filter.h"
#include <memory.h>
#include <algorithm>
#include "fpga/V2/api.h"
namespace paddle_mobile {
namespace fpga {
namespace filter {
int calc_channel_parallelism(int channel) {
if (channel <= 16) {
return 16;
} else if (channel <= 32) {
return 32;
} else if (channel <= 64) {
return 64;
} else {
return 128;
}
}
int calc_aligned_channel(int channel) {
return align_to_x(channel, calc_channel_parallelism(channel));
}
int calc_num_parallelism(int channel) {
return FILTER_PARALLELISM / calc_channel_parallelism(channel);
}
int calc_aligned_num(int num, int channel) {
return align_to_x(num, calc_num_parallelism(channel));
}
int calc_aligned_total_pixel_num(int num, int channel, int height, int width) {
int aligned_channel = calc_aligned_channel(channel);
int aligned_filter_num = calc_aligned_num(num, channel);
return aligned_filter_num * aligned_channel * height * width;
}
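// Worked example of the alignment math above (editor's sketch): for
// channel = 20, calc_channel_parallelism returns 32, so
// calc_aligned_channel(20) = align_to_x(20, 32) = 32. With
// FILTER_PARALLELISM = 1024, calc_num_parallelism(20) = 1024 / 32 = 32, so
// five 20x3x3 filters align to calc_aligned_num(5, 20) = 32 filters and
// calc_aligned_total_pixel_num(5, 20, 3, 3) = 32 * 32 * 3 * 3 = 9216 padded
// elements.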
void convert_to_hwc(float **data_in, int num, int channel, int height,
int width) {
float *tmp = *data_in;
int chw = channel * height * width;
float *data_tmp = (float *)fpga_malloc(chw * num * sizeof(float)); // NOLINT
for (int n = 0; n < num; n++) {
int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int64_t offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
*(data_tmp + n * chw + offset_height + w * channel + c) =
*((*data_in)++);
}
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
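// Index mapping used above: the NCHW element at (n, c, h, w) lands at
// n * C * H * W + h * W * C + w * C + c in the NHWC output, i.e. channels
// become the fastest-varying dimension.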
void align_filter(float **data_in, int num, int channel, int height,
int width) {
int aligned_channel = calc_channel_parallelism(channel);
int hw = height * width;
int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
float *new_data = (float *)fpga_malloc(pixel_num * sizeof(float)); // NOLINT
float *temp = *data_in;
memset(new_data, 0, pixel_num * sizeof(float));
for (int i = 0; i < num; i++) {
for (int j = 0; j < hw; j++) {
memcpy(new_data + i * aligned_channel * hw + j * aligned_channel,
temp + i * channel * hw + j * channel, channel * sizeof(float));
}
}
*data_in = new_data;
fpga_free(temp);
}
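// Padding example (editor's sketch): for channel = 3 the aligned channel
// count is 16, so each of the num * height * width pixels copies its 3
// floats and leaves the remaining 13 slots at the zero value written by
// memset.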
void format_filter(float **data_in, int num, int channel, int height, int width,
                   int group_num, float max) {
  convert_to_hwc(data_in, num, channel, height, width);
  align_filter(data_in, num, channel, height, width);
  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
  // make the rearranged filter visible to the device (fpga_flush is assumed
  // to be declared via fpga/V2/api.h, as the other fpga_* helpers are)
  fpga_flush(*data_in, pixel_num * sizeof(float));
}
void convert_fc_filter(float **data_in, int num, int chw) {
float *tmp = *data_in;
float *data_tmp = (float *)fpga_malloc(chw * num * sizeof(float)); // NOLINT
for (int n = 0; n < num; n++) {
for (int c = 0; c < chw; c++) {
data_tmp[n * chw + c] = (*data_in)[num * c + n];
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
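// convert_fc_filter transposes the fully connected weights from a
// (chw, num) layout to (num, chw): with num = 2 and chw = 3, the weight for
// filter 0 at position 2 moves from source index num * 2 + 0 = 4 to
// destination index 0 * 3 + 2 = 2.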
void format_fc_filter(float **data_in, int num, int channel, int height,
int width, int group_num, float max) {
int chw = channel * height * width;
convert_fc_filter(data_in, num, chw);
align_filter(data_in, num, channel, height, width);
}
float find_max(float *data_in, int data_size) {
float max = 0.0;
for (int i = 0; i < data_size; ++i) {
float value = data_in[i];
float abs = value > 0 ? value : -value;
max = std::max(max, abs);
}
return max;
}
signed char float_to_int8(float fdata) {
if (fdata < 0.0) {
fdata -= 0.5;
} else {
fdata += 0.5;
}
return (signed char)fdata;
}
void quantize(float **data_in, int data_size, float max) {
float *tmp = *data_in;
float fix_range = 127;
float scale = fix_range / max;
signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char));
for (int i = 0; i < data_size; i++) {
tmp_data[i] = float_to_int8(
(*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale);
}
*data_in = (float *)tmp_data; // NOLINT
fpga_free(tmp);
}
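// Worked example (editor's sketch): with max = 6.35 the scale is
// 127 / 6.35 = 20, so a weight of 0.5 becomes float_to_int8(0.5 * 20) == 10
// and max itself maps to the full-range value 127. The quantized int8 buffer
// is handed back to the caller through the float pointer.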
} // namespace filter
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define FILTER_PARALLELISM 1024
namespace paddle_mobile {
namespace fpga {
namespace filter {
int calc_channel_parallelism(int channel);
int calc_aligned_channel(int channel);
int calc_num_parallelism(int channel);
int calc_aligned_num(int num, int channel);
int calc_aligned_total_pixel_num(int num, int channel, int height, int width);
void convert_to_hwc(float** data_in, int num, int channel, int height,
int width);
void format_filter(float** data_in, int num, int channel, int height, int width,
int group_num, float max);
void convert_fc_filter(float** data_in, int num, int chw);
void format_fc_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max);
float find_max(float* data_in, int data_size);
} // namespace filter
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/image.h"
#include <memory.h>
#include <algorithm>
#include "fpga/V2/api.h"
namespace paddle_mobile {
namespace fpga {
namespace image {
void convert_to_hwc(float **data_in, int channel, int height, int width) {
float *tmp = *data_in;
float *data_tmp =
(float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT
int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) {
int64_t offset_height = h * amount_per_row;
for (int w = 0; w < width; w++) {
*(data_tmp + offset_height + w * channel + c) = *((*data_in)++);
}
}
}
*data_in = data_tmp;
fpga_free(tmp);
}
void align_image(float **data_in, int channel, int height, int width,
int aligned_channel) {
if (channel == aligned_channel) return;
float *tmp = *data_in;
float *new_data =
(float *)fpga_malloc(aligned_channel * height * width * // NOLINT
sizeof(float)); // NOLINT
memset(new_data, 0, aligned_channel * height * width * sizeof(float));
for (int i = 0; i < height * width; i++) {
memcpy(new_data + i * aligned_channel, tmp + i * channel,
channel * sizeof(float));
}
*data_in = new_data;
fpga_free(tmp);
}
void format_image(float **data_in, int channel, int height, int width,
int aligned_channel) {
convert_to_hwc(data_in, channel, height, width);
align_image(data_in, channel, height, width, aligned_channel);
}
void concat_images(int16_t **images_in, float **scales_in, void *image_out,
float *scale_out, int image_num, const uint32_t *channel_num,
int height, int width, const uint32_t *aligned_channel_num,
int out_channel) {
int hw = height * width;
scale_out[0] = 0.0;
scale_out[1] = 0.0;
for (int i = 0; i < image_num; i++) {
    scale_out[0] = std::max(scale_out[0], scales_in[i][0]);
}
scale_out[1] = 1 / scale_out[0];
for (int j = 0; j < hw; j++) {
int tmp_channel_sum = 0;
for (int i = 0; i < image_num; i++) {
memcpy(
(int16_t *)image_out + j * out_channel + tmp_channel_sum, // NOLINT
images_in[i] + j * aligned_channel_num[i],
channel_num[i] * sizeof(int16_t));
tmp_channel_sum += channel_num[i];
}
}
}
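// Layout example (editor's sketch): for two inputs with channel_num {2, 3}
// and out_channel = 5, every output pixel j holds
// [in0_c0, in0_c1, in1_c0, in1_c1, in1_c2]. The per-image
// aligned_channel_num strides only affect where each source pixel is read
// from; the output is packed densely.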
} // namespace image
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
namespace paddle_mobile {
namespace fpga {
namespace image {
void convert_to_hwc(float **data_in, int channel, int height, int width);
void align_image(float **data_in, int channel, int height, int width,
int aligned_channel);
void format_image(float **data_in, int channel, int height, int width,
int aligned_channel);
void concat_images(
int16_t **images_in, float **scales_in, void *image_out, float *scale_out,
int image_num, const uint32_t *channel_num, int height, int width,
const uint32_t *aligned_channel_num,
int out_channel); // Concat featuremaps along channel direction
} // namespace image
} // namespace fpga
} // namespace paddle_mobile
......@@ -117,9 +117,9 @@ class Attribute {
template <typename Vistor>
static typename Vistor::type_t ApplyVistor(Vistor vistor, Attribute attr) {
if (attr.variant_.TypeId() == typeid(int).hash_code()) {
if (attr.variant_.TypeId() == typeid(int).hash_code()) { // NOLINT
return vistor(attr.variant_.Get<int>());
} else if (attr.variant_.TypeId() == typeid(float).hash_code()) {
} else if (attr.variant_.TypeId() == typeid(float).hash_code()) { // NOLINT
return vistor(attr.variant_.Get<float>());
} else if (attr.variant_.TypeId() == typeid(string).hash_code()) {
return vistor(attr.variant_.GetString());
......@@ -129,7 +129,7 @@ class Attribute {
return vistor(attr.variant_.Get<vector<float>>());
} else if (attr.variant_.TypeId() == typeid(vector<string>).hash_code()) {
return vistor(attr.variant_.Get<vector<string>>());
} else if (attr.variant_.TypeId() == typeid(bool).hash_code()) {
} else if (attr.variant_.TypeId() == typeid(bool).hash_code()) { // NOLINT
return vistor(attr.variant_.Get<bool>());
} else if (attr.variant_.TypeId() == typeid(vector<bool>).hash_code()) {
return vistor(attr.variant_.Get<vector<bool>>());
......@@ -137,7 +137,6 @@ class Attribute {
return vistor(attr.variant_.Get<int64_t>());
} else {
PADDLE_MOBILE_THROW_EXCEPTION("type not support");
exit(0);
}
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "CL/cl.h"
struct CLKernelDeleter {
template <class T>
void operator()(T *clKernelObj) {
clReleaseKernel(clKernelObj);
}
};
struct CLMemDeleter {
template <class T>
void operator()(T *clMemObj) {
clReleaseMemObject(clMemObj);
}
};
struct CLEventDeleter {
template <class T>
void operator()(T *clEventObj) {
clReleaseEvent(clEventObj);
}
};
struct CLCommQueueDeleter {
template <class T>
void operator()(T *clQueueObj) {
clReleaseCommandQueue(clQueueObj);
}
};
struct CLContextDeleter {
template <class T>
void operator()(T *clContextObj) {
clReleaseContext(clContextObj);
}
};
struct CLProgramDeleter {
template <class T>
void operator()(T *clProgramObj) {
clReleaseProgram(clProgramObj);
}
};
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/cl/cl_engine.h"
#include "CL/cl.h"
#include "framework/cl/cl_tool.h"
#include <cstdlib>
#include <cstring>
namespace paddle_mobile {
namespace framework {
bool CLEngine::Init() {
if (initialized_) {
return true;
}
SetPlatform();
SetClDeviceId();
initialized_ = true;
return initialized_;
// setClCommandQueue();
// std::string filename = "./HelloWorld_Kernel.cl";
// loadKernelFromFile(filename.c_str());
// buildProgram();
}
CLEngine *CLEngine::Instance() {
static CLEngine cl_engine_;
cl_engine_.Init();
return &cl_engine_;
}
bool CLEngine::SetPlatform() {
  platform_ = NULL;      // the chosen platform
  cl_uint numPlatforms;  // the number of available platforms
cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
/**For clarity, choose the first available platform. */
if (numPlatforms > 0) {
cl_platform_id *platforms = reinterpret_cast<cl_platform_id *>(
malloc(numPlatforms * sizeof(cl_platform_id)));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
platform_ = platforms[0];
free(platforms);
return true;
} else {
return false;
}
}
bool CLEngine::SetClDeviceId() {
cl_uint numDevices = 0;
devices_ = NULL;
cl_int status =
clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
if (numDevices > 0) {
devices_ = reinterpret_cast<cl_device_id *>(
malloc(numDevices * sizeof(cl_device_id)));
status = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, numDevices, devices_,
NULL);
return true;
}
return false;
}
// std::unique_ptr<_cl_kernel, clKernel_deleter> CLEngine::GSetKernel(
// const std::string &kernel_name) {
// std::unique_ptr<_cl_kernel, clKernel_deleter> kernel(
// clCreateKernel(program_.get(), kernel_name.c_str(), NULL));
// return std::move(kernel);
//}
//
// bool CLEngine::SetClCommandQueue() {
// cl_int status;
// command_queue_.reset(
// clCreateCommandQueue(context_.get(), devices_[0], 0, &status));
// return true;
//}
// bool CLEngine::SetClContext() {
// context_.reset(clCreateContext(NULL, 1, devices_, NULL, NULL, NULL));
// return true;
//}
// bool CLEngine::LoadKernelFromFile(const char *kernel_file) {
// size_t size;
// char *str;
// std::fstream f(kernel_file, (std::fstream::in | std::fstream::binary));
//
// if (!f.is_open()) {
// return false;
// }
//
// size_t fileSize;
// f.seekg(0, std::fstream::end);
// size = fileSize = (size_t)f.tellg();
// f.seekg(0, std::fstream::beg);
// str = new char[size + 1];
// if (!str) {
// f.close();
// return 0;
// }
//
// f.read(str, fileSize);
// f.close();
// str[size] = '\0';
// const char *source = str;
// size_t sourceSize[] = {strlen(source)};
// program_.reset(
// clCreateProgramWithSource(context_.get(), 1, &source, sourceSize,
// NULL));
// return true;
//}
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include "CL/cl.h"
#include "common/enforce.h"
#include "common/log.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_tool.h"
namespace paddle_mobile {
namespace framework {
class CLEngine {
public:
static CLEngine *Instance();
bool Init();
std::unique_ptr<_cl_context, CLContextDeleter> CreateContext() {
cl_int status;
cl_context c = clCreateContext(NULL, 1, devices_, NULL, NULL, &status);
std::unique_ptr<_cl_context, CLContextDeleter> context_ptr(c);
CL_CHECK_ERRORS(status);
return std::move(context_ptr);
}
std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> CreateClCommandQueue(
cl_context context) {
cl_int status;
cl_command_queue queue =
clCreateCommandQueue(context, devices_[0], 0, &status);
std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_ptr(
queue);
CL_CHECK_ERRORS(status);
return std::move(command_queue_ptr);
}
std::unique_ptr<_cl_program, CLProgramDeleter> CreateProgramWith(
cl_context context, std::string file_name) {
FILE *file = fopen(file_name.c_str(), "rb");
PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
file_name.c_str());
fseek(file, 0, SEEK_END);
int64_t size = ftell(file);
PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
rewind(file);
char *data = new char[size + 1];
size_t bytes_read = fread(data, 1, size, file);
data[size] = '\0';
PADDLE_MOBILE_ENFORCE(bytes_read == size,
"read binary file bytes do not match with fseek");
fclose(file);
const char *source = data;
size_t sourceSize[] = {strlen(source)};
cl_program p =
clCreateProgramWithSource(context, 1, &source, sourceSize, &status_);
DLOG << " cl kernel file name: " << file_name;
DLOG << " source size: " << sourceSize[0];
CL_CHECK_ERRORS(status_);
std::unique_ptr<_cl_program, CLProgramDeleter> program_ptr(p);
return std::move(program_ptr);
}
std::unique_ptr<_cl_event, CLEventDeleter> CreateEvent(cl_context context) {
cl_event event = clCreateUserEvent(context, &status_);
std::unique_ptr<_cl_event, CLEventDeleter> event_ptr(event);
CL_CHECK_ERRORS(status_);
return std::move(event_ptr);
}
bool BuildProgram(cl_program program) {
cl_int status;
std::string path = "-cl-fast-relaxed-math -I " +
CLEngine::Instance()->GetCLPath() + "/cl_kernel";
status = clBuildProgram(program, 0, 0, path.c_str(), 0, 0);
CL_CHECK_ERRORS(status);
    if (status == CL_BUILD_PROGRAM_FAILURE) {
size_t log_size;
clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(),
CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *log = reinterpret_cast<char *>(malloc(log_size));
clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(),
CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
DLOG << " program build error: " << log;
}
if (status == CL_SUCCESS) {
return true;
} else {
return false;
}
}
cl_device_id DeviceID(int index = 0) { return devices_[index]; }
std::string GetCLPath() { return cl_path_; }
void setClPath(std::string cl_path) { cl_path_ = cl_path; }
private:
CLEngine() { initialized_ = false; }
bool SetPlatform();
bool SetClDeviceId();
bool initialized_;
cl_platform_id platform_;
cl_device_id *devices_;
cl_int status_;
std::string cl_path_;
std::unique_ptr<_cl_program, CLProgramDeleter> program_;
// bool SetClContext();
// bool SetClCommandQueue();
// bool LoadKernelFromFile(const char *kernel_file);
// bool BuildProgram();
};
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstdint>
namespace paddle_mobile {
namespace framework {
typedef uint16_t half_t;
half_t Float2Half(float f);
float Half2Float(half_t h);
void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <type_traits>
#include <vector>
#include "common/log.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_image.h"
#include "framework/cl/cl_scope.h"
namespace paddle_mobile {
namespace framework {
class CLHelper {
public:
CLHelper() = default;
explicit CLHelper(CLScope *scope) : scope_(scope) {}
void AddKernel(const std::string &kernel_name, const std::string &file_name) {
DLOG << " begin add kernel ";
auto kernel = scope_->GetKernel(kernel_name, file_name);
DLOG << " add kernel ing ";
kernels.emplace_back(std::move(kernel));
}
cl_kernel KernelAt(const int index) {
DLOG << " kernel count: " << kernels.size();
return kernels[index].get();
}
cl_command_queue CLCommandQueue() { return scope_->CommandQueue(); }
cl_context CLContext() { return scope_->Context(); }
std::vector<size_t> DefaultWorkSize(const CLImage &image) {
// n c h w
auto image_dim = image.dims();
if (image_dim.size() == 4) {
auto n = image_dim[0];
auto h = image_dim[2];
auto w = image_dim[3];
auto image_width = image.ImageWidth();
auto work_size_0 = image_width / w;
auto work_size_1 = w;
auto work_size_2 = n * h;
return {work_size_0, work_size_1, work_size_2};
} else if (image_dim.size() == 2) {
return {1, image.ImageWidth(), image.ImageHeight()};
} else if (image_dim.size() == 1) {
return {1, image.ImageWidth(), 1};
}
PADDLE_MOBILE_THROW_EXCEPTION(" not support this dim, need imp ");
}
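  // Worked example (editor's sketch, assuming the default/folder image
  // layout): for a 4-D image with dims {N=1, C=8, H=16, W=16},
  // ImageWidth() = W * ((C + 3) / 4) = 32, so the work size becomes
  // {32 / 16, 16, 1 * 16} = {2, 16, 16} -- one work item per RGBA channel
  // block, image column, and (n, h) row.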
private:
CLScope *scope_;
std::vector<std::unique_ptr<_cl_kernel, CLKernelDeleter>> kernels;
};
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/cl/cl_image.h"
namespace paddle_mobile {
namespace framework {
void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
cl_command_queue commandQueue) {
// TODO(yangfei): need imp
}
void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
cl_command_queue commandQueue) {
// TODO(yangfei): need imp
}
#ifdef PADDLE_MOBILE_DEBUG
Print &operator<<(Print &printer, const CLImage &cl_image) {
int width = cl_image.ImageDims()[0];
int height = cl_image.ImageDims()[1];
half_t *image_data = new half_t[height * width * 4];
cl_int err;
cl_mem image = cl_image.GetCLImage();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
region, 0, 0, image_data, 0, NULL, NULL);
CL_CHECK_ERRORS(err);
float *tensor_data = new float[cl_image.numel()];
auto converter = cl_image.Converter();
converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(),
cl_image.dims());
int stride = cl_image.numel() / 20;
stride = stride > 0 ? stride : 1;
printer << " dims: " << cl_image.dims() << "\n";
for (int i = 0; i < cl_image.numel(); i += stride) {
printer << tensor_data[i] << " ";
}
delete[](tensor_data);
delete[](image_data);
return printer;
}
#endif
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "CL/cl.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_engine.h"
#include "framework/cl/cl_half.h"
#include "framework/cl/cl_image_converter.h"
#include "framework/cl/cl_tool.h"
#include "framework/ddim.h"
#include "framework/tensor.h"
namespace paddle_mobile {
namespace framework {
class CLImage {
public:
CLImage() = default;
~CLImage() {
if (tensor_data_ != nullptr) {
delete[](tensor_data_);
}
if (image_converter_) {
delete (image_converter_);
}
}
/*
* will not hold input tensor data, memcpy in this method
* */
void SetTensorData(float *tensorData, const DDim &dim) {
int numel = product(dim);
if (tensor_data_ != nullptr) {
delete[](tensor_data_);
tensor_data_ = nullptr;
}
tensor_data_ = new float[numel];
memcpy(tensor_data_, tensorData, numel * sizeof(float));
tensor_dims_ = dim;
}
/*
* need call SetTensorData first
*
* folder when one dim or two dim
* */
void InitCLImage(cl_context context, cl_command_queue command_queue) {
PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr,
" need call SetTensorData first");
CLImageConverterFolder *folder_converter = new CLImageConverterFolder();
InitCLImage(context, command_queue, folder_converter);
}
void InitCLImage(cl_context context, cl_command_queue command_queue,
CLImageConverterBase *converter) {
if (image_converter_ != nullptr) {
delete (image_converter_);
}
PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr,
" need call SetTensorData first");
DLOG << " begin init cl image ";
image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
half_t *image_data = new half_t[product(image_dims_) * 4];
DLOG << " convert to image";
converter->NCHWToImage(tensor_data_, image_data, tensor_dims_);
DLOG << " end convert to image";
InitCLImage(context, image_dims_[0], image_dims_[1], image_data);
delete[](image_data);
delete[](tensor_data_);
command_queue_ = command_queue;
tensor_data_ = nullptr;
image_converter_ = converter;
initialized_ = true;
DLOG << " end init cl image";
}
void InitNImage(cl_context context, cl_command_queue command_queue) {
if (tensor_data_ == nullptr) {
PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
}
    CLImageConverterNWBlock *nw_converter = new CLImageConverterNWBlock();
    InitCLImage(context, command_queue, nw_converter);
PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4");
}
void InitDWImage(cl_context context, cl_command_queue command_queue) {
if (tensor_data_ == nullptr) {
PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
}
CLImageConverterDWBlock *dw_converter = new CLImageConverterDWBlock();
InitCLImage(context, command_queue, dw_converter);
PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4");
}
void InitEmptyImage(cl_context context, cl_command_queue command_queue,
const DDim &dim) {
PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
" empty image tensor data shouldn't have value");
CLImageConverterFolder *folder_converter = new CLImageConverterFolder();
DLOG << " to get image dims ";
image_dims_ = folder_converter->InitImageDimInfoWith(dim);
DLOG << " end get image dims " << image_dims_;
InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
tensor_dims_ = dim;
command_queue_ = command_queue;
image_converter_ = folder_converter;
cl_event_ = CLEngine::Instance()->CreateEvent(context);
initialized_ = true;
DLOG << " end init cl image";
}
cl_mem GetCLImage() const { return cl_image_.get(); }
const DDim &ImageDims() const { return image_dims_; }
inline size_t ImageWidth() const { return image_dims_[0]; }
inline size_t ImageHeight() const { return image_dims_[1]; }
inline cl_command_queue CommandQueue() const { return command_queue_; }
/*
* resize original tensor dim
* */
inline CLImage &Resize(const DDim &dims) {
tensor_dims_ = dims;
return *this;
}
template <typename T>
T *data() const {
if (initialized_) {
PADDLE_MOBILE_THROW_EXCEPTION(
" cl image has initialized, tensor data has been deleted, can't use "
"tensor data");
}
return reinterpret_cast<T *>(tensor_data_);
}
/*
* numel of tensor dim
* */
inline int64_t numel() const { return product(tensor_dims_); }
/*
* original tensor dim
* */
const DDim &dims() const { return tensor_dims_; }
cl_event GetClEvent() const { return cl_event_.get(); }
CLImageConverterBase *Converter() const { return image_converter_; }
private:
void InitCLImage(cl_context context, int width, int height, void *data) {
cl_image_format cf = {.image_channel_order = CL_RGBA,
.image_channel_data_type = CL_HALF_FLOAT};
cl_image_desc cid = {
.image_type = CL_MEM_OBJECT_IMAGE2D,
.image_width = width,
.image_height = height,
.image_depth = 1,
.image_array_size = 1,
.image_row_pitch = 0,
.image_slice_pitch = 0,
.num_mip_levels = 0,
.num_samples = 0,
// .buffer = nullptr
};
cid.buffer = nullptr;
cl_int err;
cl_mem cl_image = clCreateImage(
context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0),
&cf, // const cl_image_format *image_format
&cid, // const cl_image_desc *image_desc
data, // void *host_ptr
&err);
cl_image_.reset(cl_image);
if (err != CL_SUCCESS) {
CL_CHECK_ERRORS(err);
PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error ");
}
}
bool initialized_ = false;
std::unique_ptr<_cl_mem, CLMemDeleter> cl_image_;
std::unique_ptr<_cl_event, CLEventDeleter> cl_event_;
DDim tensor_dims_;
DDim image_dims_;
float *tensor_data_ = nullptr;
cl_context context_;
cl_command_queue command_queue_;
CLImageConverterBase *image_converter_ = nullptr;
};
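// Typical flow (editor's sketch; `host_weights` is a hypothetical host
// buffer): for a parameter image, call
//   CLImage image;
//   image.SetTensorData(host_weights, make_ddim({n, c, h, w}));
//   image.InitCLImage(context, command_queue);
// after which the host-side tensor copy is released and only the OpenCL
// image remains; output images instead use
// InitEmptyImage(context, command_queue, dim).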
void TensorToCLImage(Tensor *tensor, CLImage *image,
cl_command_queue commandQueue);
void CLImageToTensor(CLImage *image, Tensor *tensor,
cl_command_queue commandQueue);
#ifdef PADDLE_MOBILE_DEBUG
Print &operator<<(Print &printer, const CLImage &image);
#endif
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/cl/cl_image_converter.h"
namespace paddle_mobile {
namespace framework {
const DDim CLImageConverterDefault::InitImageDimInfoWith(
    const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
return make_ddim({width, height});
}
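// Example (editor's sketch): a tensor of dims {1, 8, 16, 16} packs four
// consecutive channels into one RGBA texel, giving an image of
// width = 16 * ((8 + 3) / 4) = 32 and height = 16 * 1 = 16.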
void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
DDim in_image_dim = InitImageDimInfoWith(tensor_dim);
DLOG << " tensor dim " << tensor_dim;
DLOG << " image dim " << in_image_dim;
size_t width = in_image_dim[0];
size_t height = in_image_dim[1];
int w_block = width / W;
float *p = nchw;
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < w_block * 4; c++) {
size_t i1 = i0 + (c / 4) * W;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
if (c < C) {
// int x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4);
image[i2] = Float2Half(*p);
i2 += 4;
p++;
} else {
image[i2] = 0.0;
i2 += 4;
}
}
i1 += width;
}
}
i0 += width * H;
}
}
void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
int width = image_dim[0];
  int height = image_dim[1];
float *p = tensor;
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
size_t i1 = i0 + (c / 4) * W;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
*p = Half2Float(image[i2]);
i2 += 4;
p++;
}
i1 += width;
}
}
i0 += width * H;
}
}
const DDim CLImageConverterFolder::InitImageDimInfoWith(
    const DDim &tensor_dim) {
if (tensor_dim.size() <= 2) {
int tdim[2] = {1, 1};
if (tensor_dim.size() == 1) {
tdim[1] = tensor_dim[0];
} else {
tdim[0] = tensor_dim[0];
tdim[1] = tensor_dim[1];
}
int width = (tdim[1] + 3) / 4;
int height = tdim[0];
width_of_one_block_ = width;
height_of_one_block_ = height;
c_block_ = 1;
return make_ddim({width, height});
} else {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[0];
C = new_dims[1];
H = new_dims[2];
W = new_dims[3];
size_t width = W * ((C + 3) / 4);
size_t height = H * N;
width_of_one_block_ = W;
height_of_one_block_ = H;
c_block_ = width / W;
return make_ddim({width, height});
}
}
void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0,
"tensor dim is not support ");
if (tensor_dim.size() > 2) {
CLImageConverterDefault default_converter;
default_converter.NCHWToImage(tensor, image, tensor_dim);
} else {
int tdim[2] = {1, 1};
if (tensor_dim.size() == 1) {
tdim[1] = tensor_dim[0];
} else {
tdim[0] = tensor_dim[0];
tdim[1] = tensor_dim[1];
}
DDim image_dim = InitImageDimInfoWith(tensor_dim);
int width = image_dim[0];
for (int h = 0; h < tdim[0]; h++) {
for (int w = 0; w < tdim[1]; w++) {
image[(h * width + w / 4) * 4 + (w % 4)] =
Float2Half(tensor[h * tdim[1] + w]);
}
}
}
}
void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
if (tensor_dim.size() > 2) {
CLImageConverterDefault default_converter;
default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim);
} else {
int width = image_dim[0];
int height = image_dim[1];
int H, W;
if (tensor_dim.size() == 2) {
H = tensor_dim[0];
W = tensor_dim[1];
} else if (tensor_dim.size() == 1) {
H = 1;
W = tensor_dim[0];
}
float *p = tensor;
for (int h = 0; h < H; h++) {
for (int w = 0; w < W; w++) {
p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]);
}
}
}
}
const DDim CLImageConverterNWBlock::InitImageDimInfoWith(
    const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
size_t N, C, H, W;
N = tensor_dim[0];
C = tensor_dim[1];
H = tensor_dim[2];
W = tensor_dim[3];
size_t width = W * ((N + 3) / 4);
size_t height = C * H;
return make_ddim({width, height});
}
void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
auto image_dim = InitImageDimInfoWith(tensor_dim);
float *p = tensor;
int N = tensor_dim[0];
int C = tensor_dim[1];
int H = tensor_dim[2];
int W = tensor_dim[3];
int width = image_dim[0];
int height = image_dim[1];
int block = image_dim[0] / tensor_dim[3];
for (int n = 0; n < block * 4; n++) {
for (int c = 0; c < C; c++) {
for (int h = 0; h < H; ++h) {
for (int w = 0; w < W; ++w) {
int index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4;
if (n < N) {
image[index] = Float2Half(*p);
p++;
} else {
image[index] = 0.0;
}
if (index >= (width * height * 4)) {
DLOG << " index out of range ";
}
}
}
}
}
DLOG << " init done";
}
void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
float *p = tensor;
int N = tensor_dim[0];
int C = tensor_dim[1];
int H = tensor_dim[2];
int W = tensor_dim[3];
int width = image_dim[0];
int height = image_dim[1];
int block = image_dim[0] / tensor_dim[3];
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
for (int h = 0; h < H; ++h) {
for (int w = 0; w < W; ++w) {
int index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
w * 4 + n % 4;
*p = Half2Float(image[index]);
p++;
if (index >= (width * height * 4)) {
DLOG << " index out of range ";
}
}
}
}
}
DLOG << " init done";
}
const DDim CLImageConverterDWBlock::InitImageDimInfoWith(
    const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
size_t N, C, H, W;
N = tensor_dim[0];
C = tensor_dim[1];
H = tensor_dim[2];
W = tensor_dim[3];
size_t width = W * ((N + 3) / 4);
size_t height = C * H;
return make_ddim({width, height});
}
void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
const DDim &tensor_dim) {
size_t new_dims[] = {1, 1, 1, 1};
for (int j = 0; j < tensor_dim.size(); ++j) {
new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
}
size_t N, C, H, W;
N = new_dims[1];
C = new_dims[0];
H = new_dims[2];
W = new_dims[3];
DDim in_image_dim = InitImageDimInfoWith(tensor_dim);
DLOG << " tensor dim " << tensor_dim;
DLOG << " image dim " << in_image_dim;
size_t width = in_image_dim[0];
size_t height = in_image_dim[1];
int w_block = width / W;
float *p = tensor;
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < w_block * 4; c++) {
size_t i1 = i0 + (c / 4) * W;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
if (c < C) {
// int x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
// (c % 4);
image[i2] = Float2Half(*p);
i2 += 4;
p++;
} else {
image[i2] = 0.0;
i2 += 4;
}
}
i1 += width;
}
}
i0 += width * H;
}
}
void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
const DDim &image_dim,
const DDim &tensor_dim) {
PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
float *p = tensor;
int N = tensor_dim[1];
int C = tensor_dim[0];
int H = tensor_dim[2];
int W = tensor_dim[3];
int width = image_dim[0];
  int height = image_dim[1];
size_t i0 = 0;
for (int n = 0; n < N; n++) {
for (int c = 0; c < C; c++) {
size_t i1 = i0 + (c / 4) * W;
for (int h = 0; h < H; h++) {
size_t i2 = (i1 << 2) + c % 4;
for (int w = 0; w < W; w++) {
*p = Half2Float(image[i2]);
i2 += 4;
p++;
}
i1 += width;
}
}
i0 += width * H;
}
}
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/cl/cl_half.h"
#include "framework/ddim.h"
namespace paddle_mobile {
namespace framework {
class CLImageConverterBase {
public:
virtual void NCHWToImage(float *nchw, half_t *image,
const DDim &tensor_dim) = 0;
virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim,
const DDim &tensor_dim) = 0;
  virtual const DDim InitImageDimInfoWith(const DDim &tensor_dim) = 0;
};
class CLImageConverterDefault : public CLImageConverterBase {
public:
  const DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterFolder : public CLImageConverterBase {
public:
  const DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
/*
* width of original tensor
* */
inline size_t WidthOfOneBlock() const { return width_of_one_block_; }
/*
* height of original tensor
* */
inline size_t HeightOfOneBlock() const { return height_of_one_block_; }
int GetCBlock() const { return c_block_; }
private:
int c_block_;
int width_of_one_block_;
int height_of_one_block_;
};
class CLImageConverterNWBlock : public CLImageConverterBase {
  const DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
class CLImageConverterDWBlock : public CLImageConverterBase {
  const DDim InitImageDimInfoWith(const DDim &tensor_dim);
void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
const DDim &tensor_dim);
};
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "CL/cl.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_engine.h"
#include "framework/cl/cl_tool.h"
namespace paddle_mobile {
namespace framework {
class CLScope {
public:
  CLScope() {
    CLEngine *engine = CLEngine::Instance();
    context_ = engine->CreateContext();
    command_queue_ = engine->CreateClCommandQueue(context_.get());
  }
cl_command_queue CommandQueue() { return command_queue_.get(); }
std::unique_ptr<_cl_kernel, CLKernelDeleter> GetKernel(
const std::string &kernel_name, const std::string &file_name) {
DLOG << " to get program " << file_name;
auto program = Program(file_name);
DLOG << " end get program ~ ";
DLOG << " to create kernel: " << kernel_name;
std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel(
clCreateKernel(program, kernel_name.c_str(), &status_));
CL_CHECK_ERRORS(status_);
DLOG << " end create kernel ~ ";
return std::move(kernel);
}
cl_context Context() { return context_.get(); }
cl_program Program(const std::string &file_name) {
auto it = programs_.find(file_name);
if (it != programs_.end()) {
return it->second.get();
}
auto program = CLEngine::Instance()->CreateProgramWith(
context_.get(),
CLEngine::Instance()->GetCLPath() + "/cl_kernel/" + file_name);
DLOG << " --- begin build program -> " << file_name << " --- ";
CLEngine::Instance()->BuildProgram(program.get());
DLOG << " --- end build program -> " << file_name << " --- ";
programs_[file_name] = std::move(program);
return programs_[file_name].get();
}
private:
cl_int status_;
std::unique_ptr<_cl_context, CLContextDeleter> context_;
std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_;
std::unordered_map<std::string,
std::unique_ptr<_cl_program, CLProgramDeleter>>
programs_;
};
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "CL/cl.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_engine.h"
#include "framework/tensor_base.h"
namespace paddle_mobile {
namespace framework {
class CLTensor : TensorBase {
public:
CLTensor(cl_context context, cl_command_queue command_queue)
: context_(context), command_queue_(command_queue) {}
CLTensor() = default;
/*
* if init method haven't set context and command_queue, need set
* */
void SetContextAndCommandQueue(cl_context context,
cl_command_queue command_queue) {
context_ = context;
command_queue_ = command_queue;
}
/*! Resize the dimensions of the memory block. */
inline CLTensor &Resize(const DDim &dims) {
dims_ = dims;
return *this;
}
template <typename T>
inline cl_mem mutable_with_data(const T *data) {
int64_t size = numel() * sizeof(T);
holder_.reset(new PlaceholderImpl(
size, reinterpret_cast<void *>(const_cast<T *>(data)), typeid(T),
context_, command_queue_));
return reinterpret_cast<cl_mem>(holder_->ptr());
}
inline cl_mem mutable_data(std::type_index type) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.")
int64_t size = numel() * SizeOfType(type);
if (holder_ == nullptr || holder_->size() < size + offset_) {
holder_.reset(new PlaceholderImpl(size, type, context_, command_queue_));
offset_ = 0;
}
return reinterpret_cast<cl_mem>(holder_->ptr());
}
/**
* @brief Return a pointer to cl buffer.
* @note If not exist, then allocation.
*/
template <typename T>
inline cl_mem mutable_data() {
return reinterpret_cast<cl_mem>(mutable_data(typeid(T)));
}
/**
* @brief Return a pointer to cl buffer.
*
* @param[in] dims The dimensions of the memory block.
* @param[in] place The place of the memory block.
*
* @note If not exist, then allocation.
*/
template <typename T>
inline cl_mem mutable_data(DDim dims) {
Resize(dims);
return mutable_data<T>();
}
inline cl_mem CLBuffer() {
check_memory_size();
return reinterpret_cast<cl_mem>(
reinterpret_cast<uintptr_t>(holder_->ptr()));
}
template <typename T>
inline T *Data() {
if (host_ptr_) {
      delete[] static_cast<char *>(host_ptr_);
host_ptr_ = nullptr;
}
cl_mem buffer = CLBuffer();
host_ptr_ = new char[holder_->size()];
cl_int status;
status = clEnqueueReadBuffer(command_queue_, buffer, CL_TRUE, 0,
holder_->size(), host_ptr_, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
return reinterpret_cast<T *>(host_ptr_);
}
int memorySize() { return holder_->size(); }
~CLTensor() {
DLOG << "~CLTensor";
if (host_ptr_) {
DLOG << " delete host ptr ";
      delete[] static_cast<char *>(host_ptr_);
host_ptr_ = nullptr;
}
}
private:
cl_context context_;
cl_command_queue command_queue_;
void *host_ptr_ = nullptr;
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(size_t size, void *input, std::type_index type,
cl_context context, cl_command_queue command_queue)
: ptr_(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
size, reinterpret_cast<void *>(input), NULL)),
size_(size),
type_(type),
command_queue_(command_queue) {}
PlaceholderImpl(size_t size, std::type_index type, cl_context context,
cl_command_queue command_queue)
: ptr_(clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, NULL)),
size_(size),
type_(type),
command_queue_(command_queue) {}
virtual size_t size() const { return size_; }
virtual void *ptr() const { return static_cast<void *>(ptr_.get()); }
virtual std::type_index type() const { return type_; }
virtual void set_type(std::type_index type) { type_ = type; }
std::unique_ptr<_cl_mem, CLMemDeleter> ptr_;
size_t size_;
/* the current type of memory */
std::type_index type_;
cl_command_queue command_queue_;
};
};
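// Typical flow (editor's sketch): allocate a device buffer sized by the
// tensor dims, then read it back for inspection:
//   CLTensor t(context, command_queue);
//   cl_mem buf = t.mutable_data<float>(make_ddim({1, 16}));
//   float *host = t.Data<float>();  // blocking clEnqueueReadBuffer
// mutable_with_data() instead uploads an existing host pointer via
// CL_MEM_COPY_HOST_PTR.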
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/cl/cl_tool.h"
namespace paddle_mobile {
namespace framework {
const char *opencl_error_to_str(cl_int error) {
#define CASE_CL_CONSTANT(NAME) \
case NAME: \
return #NAME;
// Suppose that no combinations are possible.
switch (error) {
CASE_CL_CONSTANT(CL_SUCCESS)
CASE_CL_CONSTANT(CL_DEVICE_NOT_FOUND)
CASE_CL_CONSTANT(CL_DEVICE_NOT_AVAILABLE)
CASE_CL_CONSTANT(CL_COMPILER_NOT_AVAILABLE)
CASE_CL_CONSTANT(CL_MEM_OBJECT_ALLOCATION_FAILURE)
CASE_CL_CONSTANT(CL_OUT_OF_RESOURCES)
CASE_CL_CONSTANT(CL_OUT_OF_HOST_MEMORY)
CASE_CL_CONSTANT(CL_PROFILING_INFO_NOT_AVAILABLE)
CASE_CL_CONSTANT(CL_MEM_COPY_OVERLAP)
CASE_CL_CONSTANT(CL_IMAGE_FORMAT_MISMATCH)
CASE_CL_CONSTANT(CL_IMAGE_FORMAT_NOT_SUPPORTED)
CASE_CL_CONSTANT(CL_BUILD_PROGRAM_FAILURE)
CASE_CL_CONSTANT(CL_MAP_FAILURE)
CASE_CL_CONSTANT(CL_MISALIGNED_SUB_BUFFER_OFFSET)
CASE_CL_CONSTANT(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
CASE_CL_CONSTANT(CL_INVALID_VALUE)
CASE_CL_CONSTANT(CL_INVALID_DEVICE_TYPE)
CASE_CL_CONSTANT(CL_INVALID_PLATFORM)
CASE_CL_CONSTANT(CL_INVALID_DEVICE)
CASE_CL_CONSTANT(CL_INVALID_CONTEXT)
CASE_CL_CONSTANT(CL_INVALID_QUEUE_PROPERTIES)
CASE_CL_CONSTANT(CL_INVALID_COMMAND_QUEUE)
CASE_CL_CONSTANT(CL_INVALID_HOST_PTR)
CASE_CL_CONSTANT(CL_INVALID_MEM_OBJECT)
CASE_CL_CONSTANT(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
CASE_CL_CONSTANT(CL_INVALID_IMAGE_SIZE)
CASE_CL_CONSTANT(CL_INVALID_SAMPLER)
CASE_CL_CONSTANT(CL_INVALID_BINARY)
CASE_CL_CONSTANT(CL_INVALID_BUILD_OPTIONS)
CASE_CL_CONSTANT(CL_INVALID_PROGRAM)
CASE_CL_CONSTANT(CL_INVALID_PROGRAM_EXECUTABLE)
CASE_CL_CONSTANT(CL_INVALID_KERNEL_NAME)
CASE_CL_CONSTANT(CL_INVALID_KERNEL_DEFINITION)
CASE_CL_CONSTANT(CL_INVALID_KERNEL)
CASE_CL_CONSTANT(CL_INVALID_ARG_INDEX)
CASE_CL_CONSTANT(CL_INVALID_ARG_VALUE)
CASE_CL_CONSTANT(CL_INVALID_ARG_SIZE)
CASE_CL_CONSTANT(CL_INVALID_KERNEL_ARGS)
CASE_CL_CONSTANT(CL_INVALID_WORK_DIMENSION)
CASE_CL_CONSTANT(CL_INVALID_WORK_GROUP_SIZE)
CASE_CL_CONSTANT(CL_INVALID_WORK_ITEM_SIZE)
CASE_CL_CONSTANT(CL_INVALID_GLOBAL_OFFSET)
CASE_CL_CONSTANT(CL_INVALID_EVENT_WAIT_LIST)
CASE_CL_CONSTANT(CL_INVALID_EVENT)
CASE_CL_CONSTANT(CL_INVALID_OPERATION)
CASE_CL_CONSTANT(CL_INVALID_GL_OBJECT)
CASE_CL_CONSTANT(CL_INVALID_BUFFER_SIZE)
CASE_CL_CONSTANT(CL_INVALID_MIP_LEVEL)
CASE_CL_CONSTANT(CL_INVALID_GLOBAL_WORK_SIZE)
CASE_CL_CONSTANT(CL_INVALID_PROPERTY)
default:
return "UNKNOWN ERROR CODE";
}
#undef CASE_CL_CONSTANT
}
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "CL/cl.h"
namespace paddle_mobile {
namespace framework {
const char* opencl_error_to_str(cl_int error);
#define CL_CHECK_ERRORS(ERR)                                              \
  do {                                                                    \
    if ((ERR) != CL_SUCCESS) {                                            \
      printf(                                                             \
          "OpenCL error with code %s happened in file %s at line %d.\n",  \
          paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__,   \
          __LINE__);                                                      \
    }                                                                     \
  } while (0)
} // namespace framework
} // namespace paddle_mobile
......@@ -41,7 +41,6 @@ inline DataLayout StringToDataLayout(const std::string &str) {
return DataLayout::kAnyLayout;
} else {
PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str())
exit(0);
}
}
......@@ -55,7 +54,6 @@ inline std::string DataLayoutToString(const DataLayout &data_layout) {
return "ANY_LAYOUT";
default:
PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string ")
exit(0);
break;
}
}
......
......@@ -42,7 +42,7 @@ struct Dim {
: head(idx % size.head), tail(idx / size.head, size.tail) {}
/** Construct a Dim with each dimension set to the given index */
Dim(int64_t idx) : head(idx), tail(idx) {}
explicit Dim(int64_t idx) : head(idx), tail(idx) {}
bool operator==(const Dim<i> &o) const {
return (head == o.head) && (tail == o.tail);
......@@ -65,7 +65,7 @@ template <>
struct Dim<0> {
static constexpr int dimensions = 0;
Dim(int64_t _head) {}
explicit Dim(int64_t _head) {}
Dim() {}
......@@ -131,7 +131,6 @@ int64_t &indexer(Dim<D> &dim, int idx) {
template <>
int64_t &indexer<0>(Dim<0> &dim, int idx) {
PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
exit(0);
}
template <int D>
......@@ -148,7 +147,6 @@ int64_t indexer(const Dim<D> &dim, int idx) {
template <>
int64_t indexer<0>(const Dim<0> &dim, int idx) {
PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
exit(0);
}
} // namespace
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "io/executor.h"
#include "framework/executor.h"
#include <algorithm>
#include <utility>
#include <vector>
......@@ -26,12 +26,25 @@ limitations under the License. */
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "operators/math/gemm.h"
#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <queue>
#include <utility>
#include "common/threadpool.h"
#endif
#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif
namespace paddle_mobile {
namespace framework {
using framework::Variable;
using framework::Variable;
#pragma mark - executor
template <typename Dtype, Precision P>
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
const bool use_optimize, const bool loddable)
......@@ -390,13 +403,18 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
framework::Tensor tensor(input, framework::make_ddim(dims));
std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
Executor<Dtype, P>::Ptype *output_ptr =
output_tensor->data<typename Executor<Dtype, P>::Ptype>();
std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
for (int j = 0; j < output_tensor->numel(); ++j) {
result_vector.push_back(output_ptr[j]);
if (output_tensor != nullptr) {
Executor<Dtype, P>::Ptype *output_ptr =
output_tensor->data<typename Executor<Dtype, P>::Ptype>();
std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
for (int j = 0; j < output_tensor->numel(); ++j) {
result_vector.push_back(output_ptr[j]);
}
return result_vector;
} else {
DLOG << "return empty vector";
return {};
}
return result_vector;
}
#ifdef PADDLE_MOBILE_FPGA
......@@ -470,8 +488,236 @@ void Executor<Dtype, P>::Predict_To(int end) {
}
#endif
#ifdef PADDLE_MOBILE_CL
template <typename Dtype, Precision P>
void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
float *tensorInput, char **data) {}
template <>
void Executor<GPU_CL, Precision::FP32>::LoadMemory(
const framework::VarDesc var_desc, float *tensorInput, char **data) {
// 1. version
uint32_t version = *reinterpret_cast<uint32_t *>(*data);
(*data) += sizeof(uint32_t);
// 2 Lod information
  uint64_t lod_level = 0;
  memcpy(&lod_level, *data, sizeof(uint64_t));
  (*data) += sizeof(uint64_t);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size = *reinterpret_cast<uint64_t *>(*data);
(*data) += sizeof(uint64_t);
std::vector<size_t> tmp(size / sizeof(size_t));
for (int k = 0; k < tmp.size(); ++k) {
tmp[k] = *reinterpret_cast<size_t *>(*data);
(*data) += sizeof(size_t);
}
}
// 3. tensor version
uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
(*data) += sizeof(uint32_t);
// 4. tensor desc
int32_t size = *reinterpret_cast<int32_t *>(*data);
(*data) += sizeof(int32_t);
std::unique_ptr<char[]> buf(new char[size]);
memcpy(buf.get(), *data, size);
(*data) += (sizeof(char) * size);
const framework::TensorDesc &desc = var_desc.Tensor_desc();
int memory_size = 1;
for (auto l : desc.Dims()) {
memory_size *= l;
}
void *memory = nullptr;
// TODO: honor desc.DataType(); only 4-byte FP32 parameters are handled here.
int type_size = 4;
memory = tensorInput;
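// Dequantization: each stored uint8 code c with per-tensor range [min_value,
// max_value] is restored as c * (max_value - min_value) / 255 + min_value.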
if (program_.quantification) {
float min_value;
float max_value;
memcpy(&min_value, *data, sizeof(float));
memcpy(&max_value, *data + sizeof(float), sizeof(float));
*data += 2 * sizeof(float);
const float factor = (max_value - min_value) / 255.0;
uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
for (int k = 0; k < memory_size; ++k) {
static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
}
*data += (memory_size * sizeof(uint8_t));
} else {
for (int n = 0; n < memory_size; n++) {
float value;
memcpy(&value, *data + n * type_size, type_size);
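// values in (-1e-30, 1e-30) are flushed to exact zero below, presumably
// to avoid denormal-range floats at inference time (not stated in source)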
if (value < 1e-30 && value > -1e-30) {
static_cast<float *>(memory)[n] = 0.0;
} else {
static_cast<float *>(memory)[n] = value;
}
}
(*data) += (sizeof(char) * memory_size * type_size);
}
}
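// A hypothetical helper (not in this commit) mirroring the header walk above;
// it advances *data past steps 1-4 of one parameter blob and returns the
// TensorDesc proto bytes, leaving *data at the raw payload. Assumes
// <cstdint>, <cstring> and <vector> are available in this translation unit.
static std::vector<char> SkipParamHeaderSketch(char **data) {
  *data += sizeof(uint32_t);                    // 1. version
  uint64_t lod_level = 0;
  memcpy(&lod_level, *data, sizeof(uint64_t));  // 2. LoD level
  *data += sizeof(uint64_t);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = 0;
    memcpy(&size, *data, sizeof(uint64_t));
    *data += sizeof(uint64_t) + size;           //    one LoD vector
  }
  *data += sizeof(uint32_t);                    // 3. tensor version
  int32_t proto_size = 0;
  memcpy(&proto_size, *data, sizeof(int32_t));  // 4. TensorDesc proto
  *data += sizeof(int32_t);
  std::vector<char> proto(*data, *data + proto_size);
  *data += proto_size;
  return proto;                                 // payload starts at *data now
}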
template <>
void Executor<GPU_CL, Precision::FP32>::InitMemory() {
for (const auto &block : to_predict_program_->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name());
if (var_desc->Persistable()) {
CLImage *cl_image = nullptr;
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
var->template GetMutable<framework::LoDTensor>();
continue;
} else {
cl_image = var->template GetMutable<framework::CLImage>();
}
char *origin_data =
ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
char *data = origin_data;
cl_context context = program_.scope->GetCLScope()->Context();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
int numel = 1;
for (auto l : desc.Dims()) {
numel *= l;
}
DLOG << var_desc->Name();
float *tensorInput = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * numel));
LoadMemory(*var_desc, tensorInput, &data);
framework::DDim ddim = framework::make_ddim(desc.Dims());
// the CLImage has no device image yet; stage the host data and dims
cl_image->SetTensorData(tensorInput, ddim);
delete[] origin_data;  // buffer from ReadFileToBuff (new[] allocation assumed)
paddle_mobile::memory::Free(tensorInput);
} else {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
auto cl_image = var->template GetMutable<framework::CLImage>();
cl_context context = program_.scope->GetCLScope()->Context();
cl_command_queue command_queue =
program_.scope->GetCLScope()->CommandQueue();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
// framework::DDim ddim = framework::make_ddim(desc.Dims());
framework::DDim ddim = cl_image->dims();
DLOG << var_desc->Name();
cl_image->InitEmptyImage(context, command_queue, ddim);
}
}
}
}
}
template <>
void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
char *origin_data = nullptr;
bool self_alloc = false;
if (program_.combined_params_buf && program_.combined_params_len) {
LOG(kLOG_INFO) << "use outer memory";
origin_data = reinterpret_cast<char *>(program_.combined_params_buf);
} else {
LOG(kLOG_INFO) << "begin init combined memory";
self_alloc = true;
origin_data = ReadFileToBuff(program_.para_path);
}
PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
char *data = origin_data;  // LoadMemory advances origin_data; keep the start pointer for freeing
for (const auto &block : to_predict_program_->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name());
if (var_desc->Persistable()) {
CLImage *cl_image = nullptr;
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
var->template GetMutable<framework::LoDTensor>();
continue;
} else {
cl_image = var->template GetMutable<framework::CLImage>();
}
cl_context context = program_.scope->GetCLScope()->Context();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
framework::DDim ddim = framework::make_ddim(desc.Dims());
int numel = 1;
for (int i = 0; i < ddim.size(); i++) {
numel = numel * ddim[i];
}
float *tensorInput = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * numel));
LoadMemory(*var_desc, tensorInput, &origin_data);
// the CLImage has no device image yet; stage the host data and dims
cl_image->SetTensorData(tensorInput, ddim);
paddle_mobile::memory::Free(tensorInput);
} else {
auto cl_image = var->template GetMutable<framework::CLImage>();
cl_context context = program_.scope->GetCLScope()->Context();
cl_command_queue command_queue =
program_.scope->GetCLScope()->CommandQueue();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
framework::DDim ddim = cl_image->dims();
// framework::DDim ddim = framework::make_ddim(desc.Dims());
cl_image->InitEmptyImage(context, command_queue, ddim);
}
}
}
if (self_alloc) {
  // origin_data has been advanced by LoadMemory, so release through the
  // saved start pointer (new[] allocation by ReadFileToBuff assumed)
  delete[] data;
}
LOG(kLOG_INFO) << "end init combined memory";
}
#endif
template class Executor<CPU, Precision::FP32>;
template class Executor<GPU_MALI, Precision::FP32>;
template class Executor<FPGA, Precision::FP32>;
template class Executor<GPU_CL, Precision::FP32>;
template class Executor<GPU_MALI, Precision::FP32>;
} // namespace framework
} // namespace paddle_mobile
......@@ -26,6 +26,7 @@ limitations under the License. */
#include "framework/tensor.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype = CPU, Precision P = Precision::FP32>
class Executor {
......@@ -79,7 +80,10 @@ class Executor {
void LoadMemory(void **data,
const std::shared_ptr<framework::VarDesc> var_desc,
framework::LoDTensor *tensor);
#ifdef PADDLE_MOBILE_CL
void LoadMemory(const framework::VarDesc var_desc, float *tensorInput,
char **data);
#endif
framework::Program<Dtype> program_;
int batch_size_ = 1;
std::shared_ptr<framework::ProgramDesc> to_predict_program_;
......@@ -97,4 +101,5 @@ class Executor {
bool loddable_ = false;
};
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "common/types.h"
#include "framework/program/program.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype = CPU, Precision P = Precision::FP32>
class Loader {
public:
/*
* @b load a fluid model stored in the separated format (one file per parameter)
* */
const Program<Dtype, P> Load(const std::string &dirname,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
/*
* @b load a fluid model stored in the combined format (single params file)
* */
const Program<Dtype, P> Load(const std::string &model_path,
const std::string &para_path,
bool optimize = false,
bool quantification = false);
const Program<Dtype, P> LoadCombinedMemory(size_t model_len,
const uint8_t *model_buf,
size_t combined_params_len,
uint8_t *combined_params_buf,
bool optimize = false,
bool quantification = false);
private:
const Program<Dtype, P> LoadProgram(const std::string &model_path,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
void InitMemoryFromProgram(
const std::shared_ptr<ProgramDesc> &originProgramDesc,
const std::shared_ptr<Scope> &scope);
};
} // namespace framework
} // namespace paddle_mobile
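// Hedged usage sketch for the Loader declared above (paths are illustrative):
//   paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
//   auto separated = loader.Load("./mobilenet_dir", /*optimize=*/true);
//   auto combined  = loader.Load("./m/model", "./m/params");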
......@@ -14,8 +14,10 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <tuple>
#include "common/log.h"
#include "common/type_define.h"
#include "framework/op_info.h"
......@@ -120,5 +122,8 @@ class OpRegistry {
#define REGISTER_OPERATOR_FPGA(op_type, op_class) \
REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA);
#define REGISTER_OPERATOR_CL(op_type, op_class) \
REGISTER_OPERATOR(op_type, op_class, cl, paddle_mobile::GPU_CL);
} // namespace framework
} // namespace paddle_mobile
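// Illustrative only (op and class names are placeholders, not from this
// commit): an OpenCL kernel implementation would register itself with the
// new macro, e.g.
//   REGISTER_OPERATOR_CL(conv2d, ops::ConvOp);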
......@@ -56,7 +56,7 @@ template <typename Dtype>
void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}
template <typename Dtype>
void OperatorBase<Dtype>::Run() const {
void OperatorBase<Dtype>::Run() {
RunImpl();
#ifdef PADDLE_MOBILE_DEBUG
DLOG << "-------------" << type_ << "----------------------------";
......@@ -84,9 +84,57 @@ void OperatorBase<Dtype>::Run() const {
#endif
}
#ifdef PADDLE_MOBILE_CL
template <>
void OperatorBase<GPU_CL>::Run() {
RunImpl();
#ifdef PADDLE_MOBILE_DEBUG
DLOG << "-------------" << type_ << "----------------------------";
vector<string> input_keys = GetInputKeys();
for (const auto &key : input_keys) {
auto var_vec_in = inputs_.at(key);
for (int i = 0; i < var_vec_in.size(); ++i) {
auto vari = scope_->FindVar(var_vec_in[i]);
if (vari->IsInitialized()) {
if (type_ == "feed") {
Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
} else {
CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
if (cl_image) {
DLOG << type_ << " input- " << key << "=" << *cl_image;
}
}
}
}
}
for (const auto &key : GetOutKeys()) {
auto var_vec_out = outputs_.at(key);
for (int i = 0; i < var_vec_out.size(); ++i) {
auto vari = scope_->FindVar(var_vec_out[i]);
if (vari->IsInitialized()) {
if (type_ == "fetch") {
Tensor *tensor = vari->template GetMutable<framework::LoDTensor>();
if (tensor) {
DLOG << type_ << " output- " << key << "=" << *tensor;
}
} else {
CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
if (cl_image) {
DLOG << type_ << " output- " << key << "=" << *cl_image;
}
}
}
}
}
#endif
}
#endif
template class OperatorBase<CPU>;
template class OperatorBase<FPGA>;
template class OperatorBase<GPU_MALI>;
template class OperatorBase<GPU_CL>;
} // namespace framework
} // namespace paddle_mobile
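// The GPU_CL Run() above encodes a convention: only "feed"/"fetch" variables
// live in host-side LoDTensors, while every other variable is a device-side
// CLImage. A hypothetical predicate making that explicit (requires <string>):
inline bool IsHostIoOpSketch(const std::string &op_type) {
  return op_type == "feed" || op_type == "fetch";
}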
This diff is collapsed.
......@@ -18,6 +18,8 @@ limitations under the License. */
#include "framework/program/program_desc.h"
#include "framework/scope.h"
#include <string>
namespace paddle_mobile {
namespace framework {
......@@ -32,7 +34,7 @@ class Program {
bool combined = false;
bool quantification = false;
size_t combined_params_len;
const uint8_t *combined_params_buf;
uint8_t *combined_params_buf;
};
} // namespace framework
......
......@@ -15,8 +15,14 @@ limitations under the License. */
#pragma once
#include <list>
#include <string>
#include <unordered_map>
#include "variable.h"
#include <vector>
#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_scope.h"
#endif
#include "framework/variable.h"
namespace paddle_mobile {
namespace framework {
......@@ -33,6 +39,10 @@ class Scope {
delete kid;
}
kids_.clear();
#ifdef PADDLE_MOBILE_CL
delete cl_scope_;
#endif
}
Scope &NewScope() const;
......@@ -72,6 +82,10 @@ class Scope {
Variable *FindVarLocally(const std::string &name) const;
#ifdef PADDLE_MOBILE_CL
CLScope *GetCLScope() { return cl_scope_; }
#endif
private:
// Call Scope::NewScope for a sub-scope.
explicit Scope(Scope const *parent) : parent_(parent) {}
......@@ -79,6 +93,10 @@ class Scope {
mutable std::unordered_map<std::string, Variable *> vars_;
mutable std::list<Scope *> kids_;
Scope const *parent_{nullptr};
#ifdef PADDLE_MOBILE_CL
CLScope *cl_scope_ = new CLScope();
#endif
};
} // namespace framework
} // namespace paddle_mobile
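// Hedged usage sketch: kernels reach the per-scope OpenCL state through the
// accessor above, as the executor code earlier in this diff does, e.g.
//   cl_context ctx = scope->GetCLScope()->Context();
//   cl_command_queue queue = scope->GetCLScope()->CommandQueue();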
This diff is collapsed.
This diff is collapsed.
......@@ -29,7 +29,9 @@ PaddleMobilePredictor<Dtype, P>::PaddleMobilePredictor(
template <typename Dtype, Precision P>
bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) {
paddle_mobile_.reset(new PaddleMobile<Dtype, P>());
#ifdef PADDLE_MOBILE_CL
paddle_mobile_->SetCLPath(config.cl_path);
#endif
if (config.memory_pack.from_memory) {
DLOG << "load from memory!";
paddle_mobile_->LoadCombinedMemory(config.memory_pack.model_size,
......@@ -126,6 +128,8 @@ CreatePaddlePredictor<PaddleMobileConfig, PaddleEngineKind::kPaddleMobile>(
x.reset(new PaddleMobilePredictor<FPGA, Precision::FP32>(config));
} else if (config.device == PaddleMobileConfig::kGPU_MALI) {
x.reset(new PaddleMobilePredictor<GPU_MALI, Precision::FP32>(config));
} else if (config.device == PaddleMobileConfig::kGPU_CL) {
x.reset(new PaddleMobilePredictor<GPU_CL, Precision::FP32>(config));
} else {
LOG(kLOG_ERROR) << "unsupport device type!";
return nullptr;
......
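// Hedged end-to-end sketch of the new GPU_CL device path through the API
// above (the cl_path value is illustrative; all other names appear in this
// diff):
//   PaddleMobileConfig config;
//   config.device = PaddleMobileConfig::kGPU_CL;
//   config.cl_path = "/data/local/tmp/cl_kernels";  // forwarded to SetCLPath
//   auto predictor = CreatePaddlePredictor<PaddleMobileConfig,
//                        PaddleEngineKind::kPaddleMobile>(config);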
The remaining 152 file diffs are collapsed.